Often, we want to know which features appear together.
This can be used to summarize a large collection of messages.
We'll use k-means to cluster related words from Twitter.
Caution: this uses live Twitter data, which often contains profanity.
# Get some tweets containing the word 'i'.
import os
from TwitterAPI import TwitterAPI
# Read Twitter credentials from environment variables.
api = TwitterAPI(os.environ.get('TW_CONSUMER_KEY'),
                 os.environ.get('TW_CONSUMER_SECRET'),
                 os.environ.get('TW_ACCESS_TOKEN'),
                 os.environ.get('TW_ACCESS_TOKEN_SECRET'))
# Collect 10000 tweets.
tweets = []
while True:
    # Request the streaming endpoint, tracking tweets containing 'i'.
    r = api.request('statuses/filter', {'track': 'i',
                                        'language': 'en'})
    if r.status_code != 200:  # error
        break
    for item in r.get_iterator():
        tweets.append(item)
        if len(tweets) > 10000:
            break
        elif len(tweets) % 100 == 0:
            print(len(tweets))
    if len(tweets) > 10000:  # Also exit the outer loop once we have enough.
        break
100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000
print(len(tweets))
10002
# Each tweet is a Python dict.
print('text', tweets[0]['text'])
print('description:', tweets[0]['user']['description'])
print('name:', tweets[0]['user']['name'])
print('location:', tweets[0]['user']['location'])
text @KYFriedComrade im rereading it ....maybe it is about twitter i dunno anymore i give up lol
description: gringa, adoptive santiaguina, ADHD stoner lela lez left stream of consciousness commentariat + retweets of dope ppl ✊🤣😎😘
name: naty 🖤🧡💚
location: Santiago, Chile
# Keep only items that contain a 'text' field (drops rate-limit notices and other non-tweet messages).
tweets = [t for t in tweets if 'text' in t]
len(tweets)
9806
# Tokenize each tweet text.
import re
tokens = []
for tweet in tweets:
    text = tweet['text'].lower()
    text = re.sub(r'@\S+', ' ', text)     # Remove mentions.
    text = re.sub(r'http\S+', ' ', text)  # Remove urls.
    tokens.append(re.findall('[A-Za-z]+', text))  # Retain words.
tokens[0]
['im', 'rereading', 'it', 'maybe', 'it', 'is', 'about', 'twitter', 'i', 'dunno', 'anymore', 'i', 'give', 'up', 'lol']
# Count words.
from collections import Counter
word_counts = Counter()
for tweet in tokens:
    word_counts.update(tweet)
# Inspect word counts.
import math
print(len(word_counts), 'unique terms')
word_counts.most_common(10)
13183 unique terms
[('i', 11474), ('rt', 5231), ('the', 3741), ('to', 3629), ('a', 2863), ('and', 2425), ('you', 2414), ('my', 2104), ('it', 1820), ('this', 1816)]
# Retain in the vocabulary only words that occur more than twice.
vocab = set([w for w, c in word_counts.items() if c > 2])
print('%d words occur at least three times.' % len(vocab))
4077 words occur at least three times.
# Prune tokens: keep only words in the vocabulary, and drop tweets that become empty.
newtoks = []
for tweet in tokens:
    newtok = [token for token in tweet if token in vocab]
    if len(newtok) > 0:
        newtoks.append(newtok)
tokens = newtoks
# A sample pruned tweet.
tokens[0]
['im', 'it', 'maybe', 'it', 'is', 'about', 'twitter', 'i', 'anymore', 'i', 'give', 'up', 'lol']
tokens[2]
['rt', 'from', 'the', 'bottom', 'of', 'my', 'heart', 'i', 'hope', 'is', 'a', 'better', 'mental', 'health', 'year', 'for', 'everyone', 'lt']
Context features
To determine if two words are similar, we will create a feature vector that counts how often other words appear nearby.
E.g.,
I really love school.
I really like school.
You love school.
love: {really@-1: 1, school@1: 2, you@-1: 1}
like: {really@-1: 1, school@1: 1}
Assumption: words with similar meanings have similar context vectors.
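To make this concrete, here is a minimal sketch (not part of the Twitter pipeline; it uses the three toy sentences above with a window of 1) of how such context vectors could be built with Counters:
from collections import Counter
toy_sentences = [['i', 'really', 'love', 'school'],
                 ['i', 'really', 'like', 'school'],
                 ['you', 'love', 'school']]
toy_contexts = {'love': Counter(), 'like': Counter()}
for sent in toy_sentences:
    for i, word in enumerate(sent):
        if word in toy_contexts:
            if i > 0:                  # word immediately to the left
                toy_contexts[word][sent[i - 1] + '@-1'] += 1
            if i < len(sent) - 1:      # word immediately to the right
                toy_contexts[word][sent[i + 1] + '@1'] += 1
print(toy_contexts['love'])  # Counter({'school@1': 2, 'really@-1': 1, 'you@-1': 1})
print(toy_contexts['like'])  # Counter({'really@-1': 1, 'school@1': 1})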
import numpy as np
def get_contexts(tweet, i, window):
    """
    Get the context features for token at position i
    in this tweet, using the given window size.
    """
    features = []
    for j in range(np.amax([0, i - window]), i):
        features.append(tweet[j] + "@" + str(j - i))
    for j in range(i + 1, min(i + window + 1, len(tweet))):
        features.append(tweet[j] + "@" + str(j - i))
    return features
print('context for word %s in %s' % (tokens[0][3], tokens[0]))
print(get_contexts(tokens[0], i=3, window=2))
context for word it in ['im', 'it', 'maybe', 'it', 'is', 'about', 'twitter', 'i', 'anymore', 'i', 'give', 'up', 'lol']
['it@-2', 'maybe@-1', 'is@1', 'about@2']
**Q: How would the approach differ if we ignored the location of each context word?**
E.g., love: {really: 1, school: 2, you: 1} vs. love: {really@-1: 1, school@1: 2, you@-1: 1}
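One way to explore this is the sketch below (not the notebook's code; the main loop that follows also shows this option as a commented-out line): drop the @offset suffix so that, e.g., really@-1 and really@1 collapse into the single feature really.
def get_contexts_bow(tweet, i, window):
    """Like get_contexts, but ignores where in the window each neighbor occurs."""
    left = tweet[max(0, i - window):i]
    right = tweet[i + 1:i + window + 1]
    return left + right
print(get_contexts(tokens[0], i=3, window=2))      # ['it@-2', 'maybe@-1', 'is@1', 'about@2']
print(get_contexts_bow(tokens[0], i=3, window=2))  # ['it', 'maybe', 'is', 'about']
With positions removed, the features are fewer and denser, so more word pairs share features, at the cost of conflating words that typically precede a term with words that typically follow it.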
# For each term, create a context vector, indicating how often
# each word occurs to the left or right of it.
from collections import defaultdict
import numpy as np
# dict from term to context vector.
contexts = defaultdict(lambda: Counter())
window = 2
for tweet in tokens:
    for i, token in enumerate(tweet):
        features = get_contexts(tweet, i, window)
        contexts[token].update(features)
        # Optionally: ignore word order.
        # contexts[token].update(tweet[:i] + tweet[i+1:])
contexts['i'].most_common(20)
[('rt@-1', 1823), ('m@1', 1667), ('t@2', 798), ('rt@-2', 529), ('and@-1', 444), ('to@2', 433), ('love@1', 424), ('a@2', 418), ('have@1', 412), ('don@1', 398), ('am@1', 374), ('can@1', 361), ('you@2', 359), ('ll@1', 334), ('ve@1', 326), ('when@-1', 323), ('just@1', 320), ('but@-1', 277), ('was@1', 276), ('the@2', 262)]
tf-idf vectors
# Compute how often each context feature appears, summed over all terms.
# (Note: this is the total count of the feature, not the number of
#  distinct terms it appears with -- see the variant further below.)
tweet_freq = Counter()
for context in contexts.values():
    tweet_freq.update(context)
tweet_freq.most_common(5)
[('i@-1', 11338), ('i@-2', 10905), ('i@1', 9485), ('i@2', 7121), ('rt@-1', 5217)]
Counter(tweet_freq.values())
Counter({1766: 1, 25: 71, 109: 8, 55: 24, 772: 1, 88: 12, 5200: 1, 306: 1, 2381: 1, 3436: 1, 34: 45, 7: 842, 412: 2, 106: 8, 649: 1, 211: 2, 44: 32, 5: 1369, 1032: 1, 2021: 1, 903: 1, 4: 2130, 2335: 2, 3249: 1, 1188: 1, 2850: 1, 1377: 1, 670: 1, 32: 43, 7121: 1, 62: 19, 27: 62, 2216: 1, 5217: 1, 14: 224, 35: 41, 60: 18, 320: 1, 10: 415, 815: 2, 219: 1, 3558: 1, 40: 32, 118: 8, 862: 1, 75: 14, 6: 1045, 10905: 1, 99: 7, 97: 6, 140: 4, 385: 3, 424: 2, 131: 3, 201: 4, 33: 42, 114: 7, 91: 5, 45: 26, 21: 102, 1453: 1, 119: 4, 78: 10, 65: 25, 468: 1, 258: 1, 144: 1, 52: 26, 585: 1, 1447: 1, 1: 240, 158: 2, 103: 10, 116: 7, 214: 2, 92: 11, 199: 3, 59: 17, 8: 669, 410: 1, 584: 1, 16: 192, 79: 13, 510: 2, 367: 1, 41: 48, 1652: 1, 1807: 1, 94: 11, 113: 4, 3: 3235, 31: 59, 586: 1, 1196: 1, 133: 3, 173: 1, 20: 132, 3656: 1, 503: 1, 2202: 1, 15: 203, 390: 3, 911: 1, 227: 3, 185: 4, 574: 1, 2: 831, 803: 1, 46: 23, 453: 1, 66: 17, 19: 132, 2037: 1, 28: 70, 73: 15, 64: 13, 2771: 1, 93: 7, 1270: 1, 490: 1, 146: 3, 1305: 1, 905: 1, 653: 1, 149: 5, 161: 5, 9485: 1, 80: 12, 49: 14, 23: 89, 84: 9, 83: 12, 9: 454, 1475: 1, 1036: 1, 129: 6, 293: 2, 321: 1, 2452: 1, 102: 3, 70: 14, 12: 301, 37: 47, 1480: 1, 297: 3, 61: 12, 966: 1, 69: 9, 1898: 1, 1522: 1, 48: 27, 148: 3, 403: 1, 484: 2, 1629: 1, 215: 4, 18: 123, 13: 259, 17: 172, 162: 3, 105: 5, 176: 1, 192: 2, 1328: 1, 24: 81, 58: 21, 29: 65, 1194: 1, 57: 22, 206: 2, 353: 1, 873: 1, 11: 369, 122: 6, 77: 14, 226: 3, 30: 64, 76: 13, 1645: 1, 39: 34, 1432: 1, 445: 1, 51: 12, 26: 68, 743: 1, 438: 3, 627: 1, 1363: 1, 1122: 1, 197: 3, 1624: 1, 451: 1, 619: 1, 640: 1, 301: 1, 1372: 1, 67: 15, 180: 3, 1366: 1, 442: 1, 536: 1, 661: 1, 81: 10, 47: 33, 22: 100, 1219: 1, 648: 1, 72: 11, 123: 2, 407: 1, 187: 3, 339: 1, 50: 16, 1322: 1, 449: 1, 152: 6, 63: 22, 1548: 1, 43: 32, 901: 1, 345: 1, 881: 1, 479: 1, 183: 2, 71: 15, 364: 4, 333: 1, 387: 1, 589: 1, 147: 5, 174: 2, 355: 2, 242: 2, 181: 1, 909: 1, 1027: 1, 446: 2, 858: 1, 2170: 1, 492: 2, 233: 2, 179: 2, 89: 7, 1547: 1, 107: 4, 124: 5, 139: 3, 885: 1, 402: 2, 101: 7, 86: 12, 3542: 1, 157: 1, 245: 1, 291: 2, 324: 1, 150: 5, 2046: 1, 216: 2, 299: 2, 3620: 1, 621: 1, 505: 1, 56: 14, 400: 1, 172: 3, 110: 1, 325: 3, 104: 5, 126: 4, 177: 3, 1268: 1, 1204: 1, 130: 5, 504: 1, 3184: 1, 134: 2, 204: 2, 1406: 1, 1407: 1, 121: 3, 285: 2, 738: 1, 924: 1, 283: 1, 38: 35, 1477: 1, 53: 14, 362: 1, 1282: 1, 704: 1, 125: 9, 155: 2, 141: 1, 686: 1, 85: 13, 383: 1, 2700: 1, 839: 2, 570: 1, 347: 1, 210: 3, 82: 6, 365: 2, 282: 1, 357: 2, 54: 10, 1066: 1, 170: 1, 189: 2, 222: 1, 545: 1, 317: 1, 377: 1, 356: 1, 778: 1, 87: 9, 154: 2, 635: 1, 376: 1, 68: 12, 136: 3, 265: 3, 213: 4, 253: 2, 287: 1, 1329: 1, 184: 4, 108: 8, 239: 1, 397: 2, 74: 5, 167: 2, 255: 1, 487: 1, 1595: 1, 90: 3, 1452: 1, 135: 2, 1677: 1, 432: 1, 1709: 1, 200: 5, 250: 2, 525: 2, 883: 1, 42: 17, 188: 3, 138: 4, 414: 1, 169: 3, 723: 2, 137: 3, 1737: 1, 127: 5, 194: 1, 768: 1, 304: 1, 511: 1, 932: 1, 163: 3, 593: 1, 120: 1, 481: 1, 1042: 1, 708: 1, 143: 10, 145: 3, 153: 4, 337: 1, 363: 1, 515: 2, 11338: 1, 117: 6, 430: 1, 100: 7, 112: 7, 343: 1, 132: 3, 241: 4, 1283: 1, 844: 1, 166: 2, 352: 1, 472: 1, 394: 2, 234: 3, 220: 1, 1467: 1, 3582: 1, 164: 1, 368: 2, 171: 2, 98: 8, 483: 1, 1324: 1, 96: 5, 678: 1, 36: 29, 263: 1, 354: 1, 286: 1, 142: 2, 228: 1, 378: 1, 278: 2, 361: 2, 539: 1, 159: 1, 281: 1, 186: 1, 208: 3, 346: 1, 209: 2, 2345: 1, 195: 1, 379: 1, 259: 2, 160: 1, 202: 1, 115: 1, 277: 2, 745: 1, 623: 1, 319: 1, 274: 2, 212: 1, 232: 1, 657: 1, 328: 1, 348: 2, 370: 1, 221: 
1, 331: 1, 646: 1, 332: 1, 225: 1, 554: 1, 360: 1, 271: 1, 231: 1, 251: 1, 244: 2, 452: 1, 436: 1, 637: 1, 266: 1, 128: 2, 165: 1, 111: 4, 230: 1, 302: 1, 312: 1, 1683: 1, 899: 1, 193: 1, 1568: 1, 551: 1, 289: 1, 660: 1, 1774: 1, 156: 2, 532: 1, 429: 1, 249: 1, 295: 1, 606: 2, 493: 1, 405: 2, 409: 1, 466: 1, 393: 1, 528: 1, 168: 1, 513: 1, 151: 1, 330: 1, 191: 1, 229: 1, 238: 1, 95: 1, 175: 1})
# In contrast to the above, this computes the number of distinct terms that each feature
# appears with. Q: How do you expect this to affect the output?
tweet_freq_2 = Counter()
for context in contexts.values():
    tweet_freq_2.update(context.keys())
tweet_freq_2.most_common(5)
[('i@2', 1335), ('i@1', 1331), ('i@-2', 1296), ('the@-1', 1087), ('and@1', 978)]
# Transform each context vector: divide term frequency by (a dampened) tweet frequency.
# Then normalize each vector to unit length.
for term, context in contexts.items():
    for term2, frequency in context.items():
        # tf / [ 1 + log(df) ]
        context[term2] = frequency / (1. + math.log(tweet_freq[term2]))
    length = math.sqrt(sum([v * v for v in context.values()]))
    for term2, frequency in context.items():
        context[term2] = 1. * frequency / length
contexts['i'].most_common(5)
[('m@1', 0.4992255422401076), ('rt@-1', 0.4843410432712745), ('t@2', 0.24288406965963458), ('love@1', 0.14382229255205484), ('rt@-2', 0.1405945803985488)]
contexts['school'].most_common(10)
[('worthless@-2', 0.33764514643816557), ('high@-1', 0.2816246484908819), ('holding@2', 0.2323291846302529), ('at@-1', 0.2204425813331341), ('to@-1', 0.20596171539422523), ('son@-2', 0.18688995834710687), ('and@1', 0.18452879080306742), ('her@2', 0.16723062126872543), ('in@-2', 0.16473427852389158), ('uneducated@1', 0.15939656236107255)]
contexts['love'].most_common(10)
[('i@-1', 0.8517303851828948), ('you@1', 0.341304167014421), ('rt@-2', 0.14556771763853768), ('so@2', 0.11603489382180723), ('i@-2', 0.11090169450417534), ('this@1', 0.09080734081934172), ('u@1', 0.08118539291674556), ('it@1', 0.080832127605461), ('in@-1', 0.07776084336903491), ('i@2', 0.06941393383495353)]
contexts['hate'].most_common(10)
[('i@-1', 0.6736723361573105), ('grocery@1', 0.3146058368934483), ('store@2', 0.2657744340570913), ('fucking@-1', 0.23378948537609445), ('pressure@2', 0.23209310502084446), ('the@1', 0.2063211337802833), ('i@-2', 0.18442359838340952), ('rt@-2', 0.17387587244484196), ('amp@-2', 0.141357574805482), ('me@1', 0.07726901404021158)]
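To see the effect of this weighting concretely, here is a minimal sketch with made-up counts (the feature frequencies below are hypothetical, not taken from the data): two features with equal raw counts end up with very different weights after the tf / [1 + log(df)] transform and length normalization.
toy = Counter({'i@-1': 5, 'zebra@1': 5})   # equal raw counts
toy_df = {'i@-1': 1000, 'zebra@1': 3}      # hypothetical feature frequencies
for f in toy:
    toy[f] = toy[f] / (1. + math.log(toy_df[f]))  # tf / [ 1 + log(df) ]
length = math.sqrt(sum(v * v for v in toy.values()))
for f in toy:
    toy[f] = toy[f] / length
print(toy)  # the rare feature 'zebra@1' now carries far more weight than the common 'i@-1'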
At this point, we have one context dictionary per term, indicating the features that co-occur with it (weighted by inverse tweet frequency).
Next, we have to cluster these vectors. To do this, we'll need to be able to compute the Euclidean distance between two vectors.
# n.b. This is not efficient!
def distance(c1, c2):
    if len(c1.keys()) == 0 or len(c2.keys()) == 0:
        return 1e9
    keys = set(c1.keys()) | set(c2.keys())
    distance = 0.
    for k in keys:
        distance += (c1[k] - c2[k]) ** 2
    return math.sqrt(distance)
print(distance({'hi':10, 'bye': 5}, {'hi': 9, 'bye': 4}))
print(distance({'hi':10, 'bye': 5}, {'hi': 8, 'bye': 4}))
1.4142135623730951
2.23606797749979
def find_closest(term, n=5):
    terms = np.array(list(contexts.keys()))
    context = contexts[term]
    distances = []
    for term2, context2 in contexts.items():
        distances.append(distance(context, context2))
    return terms[np.argsort(distances)][:n]
find_closest('love', n=10)
array(['love', 'hope', 'miss', 'm', 'am', 'guess', 'll', 'mean', 'cant', 'wish'], dtype='<U23')
# Remove terms whose context vectors contain at most one feature.
nz_contexts = [t for t, context in contexts.items()
               if len(context) > 1]
contexts = dict([(term, contexts[term]) for term in nz_contexts])
print(len(nz_contexts), 'nonzero contexts')
4077 nonzero contexts
# e.g., what are three context features for the first term in contexts?
print(list(contexts.keys())[0])
print(list(list(contexts.values())[0].items())[:3])
im
[('it@1', 0.01264806377201966), ('maybe@2', 0.025412211140884603), ('y@-2', 0.01883753470306806)]
# Transform context dicts to a sparse vector
# for sklearn.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X = vec.fit_transform(contexts.values())
names = np.array(vec.get_feature_names())  # In newer scikit-learn, use get_feature_names_out().
print(names[:10])
print(X[0])
['a@-1' 'a@-2' 'a@1' 'a@2' 'aaaaaa@-1' 'aaaaaa@-2' 'aaaaaa@1' 'aaaaaa@2' 'aaron@-1' 'aaron@-2'] (0, 1) 0.012176616904934769 (0, 2) 0.10774881812308722 (0, 3) 0.08406853449098409 (0, 234) 0.02946119105248044 (0, 265) 0.024956944731107572 (0, 332) 0.02065859263633043 (0, 368) 0.014543761318494223 (0, 369) 0.014329995051336608 (0, 393) 0.02368590894509358 (0, 413) 0.01978485270160371 (0, 434) 0.018043994507438255 (0, 436) 0.015269990983388075 (0, 475) 0.015840112090628022 (0, 480) 0.01604913069072621 (0, 483) 0.036733833080258765 (0, 485) 0.02443477905749157 (0, 486) 0.04897844410701169 (0, 611) 0.04022372022569253 (0, 641) 0.03840265052217021 (0, 880) 0.05818855015204386 (0, 881) 0.02916261855721708 (0, 1053) 0.01678036618972246 (0, 1063) 0.020161956350368875 (0, 1206) 0.0446093161177403 (0, 1209) 0.022203132192976167 : : (0, 14849) 0.04492780311830858 (0, 14975) 0.4561416854630144 (0, 15102) 0.019454490377035092 (0, 15137) 0.03246274061678437 (0, 15216) 0.018578551803747474 (0, 15217) 0.018633750350250212 (0, 15224) 0.01978485270160371 (0, 15490) 0.014819584801508275 (0, 15491) 0.015207803801313627 (0, 15502) 0.3141806233384448 (0, 15532) 0.03659194385094082 (0, 15671) 0.013794628438236247 (0, 15735) 0.04108584562516659 (0, 15836) 0.024359499004844704 (0, 15870) 0.025924583046390885 (0, 15873) 0.024548591946210438 (0, 15882) 0.022409430110929447 (0, 15893) 0.01883753470306806 (0, 15897) 0.02417935783176741 (0, 15944) 0.01820112202264155 (0, 15971) 0.051086598423935724 (0, 16003) 0.012327178077380431 (0, 16004) 0.0497998532898863 (0, 16006) 0.03695460482404303 (0, 16059) 0.03639315451392994
# Which row of X is the word "love"?
love_idx = list(contexts.keys()).index('love')
print(love_idx)
# What are the context feature values for love?
print(X[love_idx])
# Print a highly ranked feature.
print(names[15534])
56 (0, 0) 0.01166315589869699 (0, 1) 0.009432626921748103 (0, 2) 0.0069556430065184915 (0, 3) 0.020932638324993653 (0, 12) 0.009893569523380101 (0, 28) 0.0029242019578079237 (0, 29) 0.005935227260208678 (0, 31) 0.005850253394142066 (0, 40) 0.00502062522553737 (0, 134) 0.01569827105681456 (0, 136) 0.00393482667128263 (0, 181) 0.008700840482568875 (0, 219) 0.009893569523380101 (0, 233) 0.0058241406815781925 (0, 241) 0.003442994829099182 (0, 251) 0.003660037555588828 (0, 278) 0.0057055343458282184 (0, 334) 0.0038489132041543044 (0, 367) 0.002790339436949068 (0, 368) 0.008449753057068984 (0, 369) 0.002775185795961421 (0, 370) 0.019557649156184565 (0, 402) 0.005416543436788495 (0, 410) 0.009893569523380101 (0, 412) 0.003966821569702731 : : (0, 15787) 0.004102954333698908 (0, 15828) 0.03718458281719579 (0, 15829) 0.0031113083765471677 (0, 15831) 0.006289728235394356 (0, 15847) 0.022014344111415302 (0, 15857) 0.005957911837271409 (0, 15863) 0.004299918936961106 (0, 15871) 0.00502062522553737 (0, 15885) 0.004193444669798336 (0, 15894) 0.007005863991464066 (0, 15895) 0.003619603953271363 (0, 15946) 0.003511581004101353 (0, 15969) 0.020762766580557777 (0, 15971) 0.009893569523380101 (0, 15990) 0.004428087090688781 (0, 16003) 0.01193657408187391 (0, 16004) 0.012055468703578194 (0, 16005) 0.341304167014421 (0, 16006) 0.035783646133738856 (0, 16014) 0.004876067603250058 (0, 16019) 0.011392909025650088 (0, 16020) 0.002893415638699762 (0, 16021) 0.031212968706279383 (0, 16022) 0.0028439084729393136 (0, 16060) 0.007437161692980253 while@1
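As noted above, the pairwise distance function is slow. Now that the context vectors live in the sparse matrix X, a vectorized alternative (a sketch, not part of the original notebook) can use scikit-learn's pairwise euclidean_distances:
from sklearn.metrics.pairwise import euclidean_distances

def find_closest_fast(term, n=5):
    # Distances from this term's row of X to every row of X, computed at once.
    idx = list(contexts.keys()).index(term)
    dists = euclidean_distances(X[idx], X).ravel()
    terms = np.array(list(contexts.keys()))
    return terms[np.argsort(dists)][:n]

# Should roughly match find_closest('love', n=10) above (contexts were pruned in between).
find_closest_fast('love', n=10)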
# Let's cluster!
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
from sklearn.cluster import KMeans
num_clusters = 20
kmeans = KMeans(num_clusters)
kmeans.fit(X)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
# Let's print out the top features for each mean vector.
# This is swamped by common terms
for i in range(num_clusters):
    print(i, ' '.join(names[np.argsort(
        kmeans.cluster_centers_[i])[::-1][:5]]))
0 to@1 i@-2 i@-1 be@2 the@2
1 a@-1 i@1 i@2 a@-2 the@-1
2 the@1 up@1 i@-2 i@2 my@1
3 my@-1 and@1 i@2 the@-1 i@1
4 i@-2 m@-1 i@1 m@-2 for@1
5 of@1 the@-1 the@2 a@-1 a@-2
6 it@-1 so@-2 much@-1 many@-1 i@1
7 rt@-1 i@2 i@1 and@1 is@1
8 in@1 the@2 i@-2 my@2 the@-1
9 i@-1 rt@-2 i@-2 you@1 it@1
10 the@2 but@1 i@2 on@1 be@-1
11 i@1 m@2 rt@-1 i@2 i@-2
12 me@1 and@-1 a@1 i@-2 i@2
13 t@1 i@-1 you@-1 be@2 rt@-2
14 with@1 my@2 i@-2 the@2 to@-1
15 i@2 i@1 and@1 the@-2 a@-2
16 to@-1 i@-2 i@-1 t@-1 i@2
17 the@-1 in@-2 i@1 i@2 of@-2
18 i@-2 i@2 just@-1 was@-1 and@2
19 of@-1 and@1 i@1 the@-2 the@-1
# .transform will compute the distance from each context to each cluster.
distances = kmeans.transform(X)
# e.g., what is the distance from the word "love" to each cluster?
print('distance from term "love" to each cluster:')
print(distances[love_idx])
# what is the closest cluster for the word "love"?
print('closest cluster to "love":')
print(np.argmin(distances[love_idx]))
distance from term "love" to each cluster:
[1.00874162 1.0266301 0.97164003 1.03189144 1.03358099 1.04555106 0.9921334 1.0001127 0.99631078 0.5906229 0.98516281 1.01236052 0.96816683 1.1055978 0.99184346 0.98134482 0.95310019 1.02250443 0.95707704 1.00474645]
closest cluster to "love":
9
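The same assignment is also available directly from the fitted model: kmeans.labels_ holds the index of the closest cluster for each row of X. A small check (not in the original notebook) that should agree with the argmin above:
print(kmeans.labels_[love_idx])                                    # expected: 9 for this run
print(np.argmin(distances[love_idx]) == kmeans.labels_[love_idx])  # expected: True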
# Finally, we'll print the words that are closest
# to the mean of each cluster.
terms = np.array(list(contexts.keys()))
for i in range(distances.shape[1]):
    print(i, ' '.join(terms[np.argsort(distances[:, i])[1:10]]), '\n')
0 supposed listening going wanted used decided listen able needs
1 good few bit wrap couple dream virgin requirement little
2 for on of at with all is during from
3 favorite face hair life heart boyfriend wife brother head
4 not screaming sorry dying doin scared gonna assuming sobbing
5 part rest one majority type habit tired instead benefit
6 everyday aaaaaa whew and but yard sounds things seemed
7 hello hi okay i mingyu nah psa mutual age
8 participate doctor punched zoomed manga engage lived live jump
9 am love hope just ll have guess mean miss
10 shade one me pumped this and okay that here
11 but when because and omg what before where ok
12 with tell making call is help on in make
13 didn wasn ain wouldn couldn haven doesn isn can
14 ruv communicate agree familiar wrong flights along interact dealing
15 that me time but the day this my is
16 be see make do say save hear go cry
17 way best world weekends first saddest waist bathroom bus
18 a never like you sorry u done to so
19 us mone nowhere the concerning them luck people prof
Clearly, interpreting these results requires a bit of investigation.
As the number of tweets increases, we expect these clusters to become more coherent.
How does error decrease with the number of clusters?
kmeans.score(X)
-3727.0865918080995
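KMeans.score returns the negative of the within-cluster sum of squared distances, so on the training data it should match the negative of the fitted model's inertia_ attribute. A quick check (not in the original notebook):
print(-kmeans.inertia_)  # should be approximately kmeans.score(X) above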
import matplotlib.pyplot as plt
%matplotlib inline
scores = []
num_cluster_options = [5,10,20,50,100]
for num_clusters in num_cluster_options:
    kmeans = KMeans(num_clusters, n_init=10, max_iter=10)
    kmeans.fit(X)
    score = -1 * kmeans.score(X)
    scores.append(score)
    print('k=%d score=%g' % (num_clusters, score))
plt.figure()
plt.plot(num_cluster_options, scores, 'bo-')
plt.xlabel('num clusters')
plt.ylabel('error')
plt.show()
k=5 score=3860.63
k=10 score=3786.09
k=20 score=3722.77
k=50 score=3638.46
k=100 score=3553.49
**How does error vary by initialization?**
scores = []
for i in range(10):
    kmeans = KMeans(20, n_init=1,
                    max_iter=10,
                    init='random')
    kmeans.fit(X)
    score = -1 * kmeans.score(X)
    scores.append(score)
    print('score=%g' % (score))
plt.figure()
plt.plot(range(10), sorted(scores), 'bo-')
plt.xlabel('sample')
plt.ylabel('error')
plt.show()
score=3758.22
score=3747.06
score=3741.92
score=3749.15
score=3741.44
score=3752.9
score=3739.67
score=3748.15
score=3744.9
score=3741.4
We now have a way to represent each word in 20-dimensional space: its vector of distances to the 20 cluster centers.
def get_distances(word, contexts, distances):
    wd_idx = list(contexts.keys()).index(word)
    return distances[wd_idx]
print(get_distances('love', contexts, distances))
print(get_distances('like', contexts, distances))
print(get_distances('hate', contexts, distances))
print(get_distances('pizza', contexts, distances))
[1.00874162 1.0266301 0.97164003 1.03189144 1.03358099 1.04555106 0.9921334 1.0001127 0.99631078 0.5906229 0.98516281 1.01236052 0.96816683 1.1055978 0.99184346 0.98134482 0.95310019 1.02250443 0.95707704 1.00474645]
[0.94998976 0.99347211 0.92797163 1.00760459 0.87369431 1.02867307 0.97299124 0.97489501 0.97819442 0.89745615 0.95924255 0.90359807 0.93594526 1.2176145 0.98537882 0.96513233 0.93622821 0.9995155 0.85784706 0.98290799]
[1.01027941 1.0360027 0.94782441 1.04189133 1.02496428 1.05721972 1.00490278 1.00783092 1.00071431 0.71358098 0.99042856 1.02849363 0.97045892 1.14231512 1.01697286 0.98572949 0.96373614 1.03701544 0.95307083 1.00673499]
[1.0771791 1.0382791 1.01844359 1.04147931 1.09082894 1.06598877 1.00127222 1.01626417 1.02439718 1.09741815 0.99488921 1.00199894 0.99562545 1.28351384 1.04215818 0.98982216 1.04985219 1.04174968 1.01364399 1.01655176]
We can use these vectors to compute how similar two words are.
from math import sqrt
def sim(v1, v2):
    """ Cosine similarity of two vectors. """
    # The denominator is the product of the vectors' Euclidean norms.
    return np.dot(v1, v2) / (sqrt(np.dot(v1, v1)) * sqrt(np.dot(v2, v2)))
print(sim(get_distances('love', contexts, distances),
          get_distances('like', contexts, distances)))
0.9958552518939588
print(sim(get_distances('love', contexts, distances),
          get_distances('hate', contexts, distances)))
0.9995853218896937
print(sim(get_distances('love', contexts, distances),
          get_distances('pizza', contexts, distances)))
0.993983274406897
So, love is more similar to like than to pizza.
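For comparison, here is a hedged sketch (not in the original notebook) that computes cosine similarity directly on the full sparse context vectors in X, rather than on the 20-dimensional cluster-distance vectors, using scikit-learn's cosine_similarity:
from sklearn.metrics.pairwise import cosine_similarity

def sim_full(w1, w2):
    keys = list(contexts.keys())
    return cosine_similarity(X[keys.index(w1)], X[keys.index(w2)])[0, 0]

print(sim_full('love', 'like'))
print(sim_full('love', 'pizza'))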
However, this approach treats every context word as equally important when computing similarity.
Presumably, some context words are more informative than others (e.g., the versus hippopotamus).
tf-idf captures this to some extent.
Can we use machine learning to weight features based on how predictive they are?
See Word2Vec.ipynb
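As a pointer in that direction, here is a minimal sketch (assuming gensim >= 4 is installed; not part of this notebook) that learns such weighted word representations directly from the tokenized tweets:
from gensim.models import Word2Vec

w2v = Word2Vec(sentences=tokens,  # the tokenized tweets from above
               vector_size=50,    # dimensionality of each word vector
               window=2,          # same context window as before
               min_count=3,       # same vocabulary threshold as before
               workers=2)
# Words appearing in similar contexts should get similar vectors.
w2v.wv.most_similar('love', topn=10)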