~5 True/False, ~6 short answer
Topics:
Question types:
dem·o·graph·ics
statistical data relating to the population and particular groups within it.
E.g., age, ethnicity, gender, income, ...
Marketing
Social Media as Surveys
Health
**User profiles vary from site to site.**
# Guessing gender
import configparser
import sys
from TwitterAPI import TwitterAPI
def get_twitter(config_file):
    """ Build a TwitterAPI client from credentials stored in config_file.
    Args:
      config_file ... A file in ConfigParser format whose [twitter] section
                      provides consumer_key, consumer_secret, access_token
                      and access_token_secret.
    Returns:
      An instance of TwitterAPI constructed from those four values.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # Order matters: the values are passed positionally to TwitterAPI.
    credential_keys = ('consumer_key', 'consumer_secret',
                       'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', key) for key in credential_keys]
    return TwitterAPI(*credentials)
# Stream messages matching the track term 'i' until n_tweets of them have
# been collected, printing a progress line after every 100 messages.
n_tweets = 1000
twitter = get_twitter('twitter.cfg')
tweets = []
for r in twitter.request('statuses/filter', {'track': 'i'}):
    tweets.append(r)
    if not len(tweets) % 100:
        print('%d tweets' % len(tweets))
    if n_tweets <= len(tweets):
        break
print('fetched %d tweets' % len(tweets))
100 tweets 200 tweets 300 tweets 400 tweets 500 tweets 600 tweets 700 tweets 800 tweets 900 tweets 1000 tweets fetched 1000 tweets
# not all tweets are returned
# https://dev.twitter.com/streaming/overview/messages-types#limit_notices
[t for t in tweets if 'user' not in t][:6]
[{'limit': {'timestamp_ms': '1475689818824', 'track': 12}}, {'limit': {'timestamp_ms': '1475689818840', 'track': 9}}, {'limit': {'timestamp_ms': '1475689819793', 'track': 89}}, {'limit': {'timestamp_ms': '1475689819825', 'track': 91}}, {'limit': {'timestamp_ms': '1475689819846', 'track': 73}}, {'limit': {'timestamp_ms': '1475689819857', 'track': 99}}]
# Restrict to actual tweets: keep only messages that carry a 'user' field.
# This drops stream notices such as the {'limit': ...} messages listed above
# (and any other message without a 'user', e.g. notices about "deleted" tweets).
tweets = [t for t in tweets if 'user' in t]
print('fetched %d tweets' % len(tweets))
fetched 927 tweets
# Print 10 names.
names = [t['user']['name'] for t in tweets]
names[:10]
['Jeffrey Salas', 'Iberia TV', 'Chelllly', 'selene', 'Hallie Earnhart', 'COSTLIFEDJ💯🐐', 'Aldous Snow', '🔮🔮🔮', 'İÇERDE', 'Madi']
# Fetch census name data from:
# http://www2.census.gov/topics/genealogy/1990surnames/
import requests
from pprint import pprint
# Census first-name files; each row looks like 'JAMES 3.318 3.318 1'.
males_url = ('http://www2.census.gov/topics/genealogy/'
             '1990surnames/dist.male.first')
females_url = ('http://www2.census.gov/topics/genealogy/'
               '1990surnames/dist.female.first')


def _fetch_lines(url, timeout=30):
    """ Download url and return the response text as a list of lines.
    Args:
      url ....... Address of a plain-text file.
      timeout ... Seconds to wait for the server before giving up.
    Returns:
      A list of strings, one per line, without line terminators.
    Raises:
      requests.HTTPError if the server answers with a 4xx/5xx status, so an
      error page is never mistaken for name data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # splitlines() copes with '\r\n' endings and yields no trailing ''.
    return response.text.splitlines()


males = _fetch_lines(males_url)
females = _fetch_lines(females_url)
print('males:')
pprint(males[:10])
print('females:')
pprint(females[:10])
males: ['JAMES 3.318 3.318 1', 'JOHN 3.271 6.589 2', 'ROBERT 3.143 9.732 3', 'MICHAEL 2.629 12.361 4', 'WILLIAM 2.451 14.812 5', 'DAVID 2.363 17.176 6', 'RICHARD 1.703 18.878 7', 'CHARLES 1.523 20.401 8', 'JOSEPH 1.404 21.805 9', 'THOMAS 1.380 23.185 10'] females: ['MARY 2.629 2.629 1', 'PATRICIA 1.073 3.702 2', 'LINDA 1.035 4.736 3', 'BARBARA 0.980 5.716 4', 'ELIZABETH 0.937 6.653 5', 'JENNIFER 0.932 7.586 6', 'MARIA 0.828 8.414 7', 'SUSAN 0.794 9.209 8', 'MARGARET 0.768 9.976 9', 'DOROTHY 0.727 10.703 10']
# Get names: the first whitespace-separated field of each row, lowercased.
# Testing m.strip() (rather than just m) also skips whitespace-only rows,
# for which m.split()[0] would raise IndexError.
male_names = {m.split()[0].lower() for m in males if m.strip()}
female_names = {f.split()[0].lower() for f in females if f.strip()}
print('%d male and %d female names' % (len(male_names), len(female_names)))
# Sets are unordered, so the 10 names shown are an arbitrary sample.
print('males:\n' + '\n'.join(list(male_names)[:10]))
print('\nfemales:\n' + '\n'.join(list(female_names)[:10]))
1219 male and 4275 female names males: courtney lupe christian alfredo jonah rolando samuel luciano cesar vaughn females: lannie lyndsay lupe lolita shayna elnora laurene loree charisse gertrud
# Initialize gender of all tweets to unknown.
for t in tweets:
t['gender'] = 'unknown'
# label a Twitter user's gender by matching name list.
import re
def gender_by_name(tweets, male_names, female_names):
    """ Label each tweet's 'gender' by looking up the user's first name.
    Args:
      tweets ......... List of tweet dicts; each must contain t['user']['name'].
      male_names ..... Collection of lowercase male first names.
      female_names ... Collection of lowercase female first names.
    Returns:
      None. Sets t['gender'] to 'male', 'female' or 'unknown' on every tweet.
    """
    for t in tweets:
        # Assign the default first so that every tweet is (re)labeled, even
        # when its name is missing or contains no word characters.
        t['gender'] = 'unknown'
        tokens = (t['user']['name'] or '').split()
        if not tokens:
            # None, empty or whitespace-only name: nothing to look up.
            continue
        # Remove punctuation: use the first run of word characters in the
        # first whitespace-separated token (e.g. '-Mary!' -> 'mary').
        match = re.search(r'\w+', tokens[0].lower())
        if match is None:
            continue
        first = match.group(0)
        if first in male_names:
            t['gender'] = 'male'
        elif first in female_names:
            t['gender'] = 'female'
gender_by_name(tweets, male_names, female_names)
# What's wrong with this approach?
from collections import Counter
def print_genders(tweets):
    """ Print a summary of the gender labels assigned to tweets.
    Prints the fraction of tweets labeled 'male' or 'female', the raw label
    counts, and the label and user name of the first 20 tweets.
    Args:
      tweets ... List of tweet dicts with t['gender'] and t['user']['name'].
    """
    counts = Counter(t['gender'] for t in tweets)
    total = sum(counts.values())
    labeled = counts['male'] + counts['female']
    # With no tweets the ratio would be 0/0; report 0.00 instead of raising.
    fraction = labeled / total if total else 0.0
    print('%.2f of accounts are labeled with gender' % fraction)
    print('gender counts:\n', counts)
    for t in tweets[:20]:
        print(t['gender'], t['user']['name'])
print_genders(tweets)
0.35 of accounts are labeled with gender gender counts: Counter({'unknown': 598, 'female': 178, 'male': 151}) male Jeffrey Salas unknown Iberia TV unknown Chelllly female selene female Hallie Earnhart unknown COSTLIFEDJ💯🐐 unknown Aldous Snow unknown 🔮🔮🔮 unknown İÇERDE unknown Madi female Fiona Von unknown spooky cait unknown -HeartlessLover💋 unknown #NicomaineDeiHardFan female princess 🎃 male Daniel Poehlman unknown savage. unknown 🐨 Lo 🍕 male Isaiah Persons female Barrie
# What about ambiguous names?
def print_ambiguous_names(male_names, female_names):
    """ Print how many names occur in both collections, then up to 20 of them.
    Args:
      male_names ..... Collection of male first names.
      female_names ... Collection of female first names.
    """
    # Walk male_names so the printed order follows its iteration order.
    shared = []
    for name in male_names:
        if name in female_names:
            shared.append(name)
    print('found %d ambiguous names:\n' % len(shared))
    print('\n'.join(shared[:20]))
print_ambiguous_names(male_names, female_names)
found 331 ambiguous names: courtney lupe christian samuel maurice ronald eric toby perry jimmy shannon jan christopher paris cory marshall lou merrill mickey stephen
# Keep names that are more frequent in one gender than the other.
def get_percents(name_list):
    """ Parse raw census rows into a dict from lowercased name to percent.
    Each row looks like 'JAMES 3.318 3.318 1'; the first field is the name
    and the second is the percent of people of that gender with that name.
    Args:
      name_list ... List of raw row strings.
    Returns:
      dict mapping name (lowercase) to its percent as a float.
    """
    percents = {}
    for row in name_list:
        fields = row.split()
        # Skip blank or whitespace-only rows and rows lacking a percent
        # column, which would otherwise raise IndexError.
        if len(fields) < 2:
            continue
        percents[fields[0].lower()] = float(fields[1])
    return percents
males_pct = get_percents(males)
females_pct = get_percents(females)
# Assign a name as male if it is more common among males than females, and
# as female if it is more common among females than males.
# Both filters test against the same snapshot of names that appear on both
# lists, so the result does not depend on which filter runs first; a name
# that is exactly as common in both lists is dropped from both.
ambiguous_names = male_names & female_names
male_names = {m for m in male_names
              if m not in ambiguous_names or males_pct[m] > females_pct[m]}
female_names = {f for f in female_names
                if f not in ambiguous_names or females_pct[f] > males_pct[f]}
print_ambiguous_names(male_names, female_names)
print('%d male and %d female names' % (len(male_names), len(female_names)))
found 0 ambiguous names: 1146 male and 4017 female names
# Relabel twitter users (compare with above)
gender_by_name(tweets, male_names, female_names)
print_genders(tweets)
0.35 of accounts are labeled with gender gender counts: Counter({'unknown': 598, 'female': 197, 'male': 132}) male Jeffrey Salas unknown Iberia TV unknown Chelllly female selene female Hallie Earnhart unknown COSTLIFEDJ💯🐐 unknown Aldous Snow unknown 🔮🔮🔮 unknown İÇERDE unknown Madi female Fiona Von unknown spooky cait unknown -HeartlessLover💋 unknown #NicomaineDeiHardFan female princess 🎃 male Daniel Poehlman unknown savage. unknown 🐨 Lo 🍕 male Isaiah Persons female Barrie
# Who are the unknowns?
# "Filtered" data can have big impact on analysis.
unknown_names = Counter(t['user']['name']
for t in tweets if t['gender'] == 'unknown')
unknown_names.most_common(20)
[('Pedagog 05447243650', 2), ('WIN PS4 NOW!!', 2), ('Dev 🤘🏾', 2), ('Nihat Osmanlı', 2), ('hasancoco', 2), ('G', 2), ('🔥', 2), ('✨', 2), ('Vikashgarg', 2), ('Brends', 1), ('paipai_devil', 1), ('IG:mrdope_', 1), ('Drones365', 1), ('Henny P. Newton', 1), ('Mehmet boğa', 1), ('.', 1), ('spooky cait', 1), ('kait', 1), ('Rahul Sudeep', 1), ('Kaleidopop', 1)]
# How do the profiles of male Twitter users differ from
# those of female users?
# Collect the profile description of every user labeled male / female.
# NOTE(review): a description is presumably None or '' when the user has no
# bio; tokenize() further down returns [] for falsy input -- confirm.
male_profiles = [t['user']['description'] for t in tweets
                 if t['gender'] == 'male']
female_profiles = [t['user']['description'] for t in tweets
                   if t['gender'] == 'female']
#male_profiles = [t['text'] for t in tweets
# if t['gender'] == 'male']
#female_profiles = [t['text'] for t in tweets
# if t['gender'] == 'female']
import re
def tokenize(s):
    """ Split text into lowercase word tokens.
    Args:
      s ... A string, or a falsy value such as None or ''.
    Returns:
      A list of lowercase tokens; every run of non-word characters acts as a
      separator. Falsy input yields an empty list.
    """
    if not s:
        return []
    # Raw string: '\W' in a normal string literal is an invalid escape.
    return re.sub(r'\W+', ' ', s).lower().split()
# Tally how often each token occurs across all profiles of each gender.
male_words = Counter()
for p in male_profiles:
    male_words.update(tokenize(p))
female_words = Counter()
for p in female_profiles:
    female_words.update(tokenize(p))
print('Most Common Male Terms:')
pprint(male_words.most_common(10))
print('\nMost Common Female Terms:')
pprint(female_words.most_common(10))
Most Common Male Terms: [('and', 34), ('i', 31), ('the', 21), ('a', 20), ('my', 19), ('of', 17), ('in', 16), ('love', 14), ('with', 13), ('to', 11)] Most Common Female Terms: [('i', 35), ('you', 29), ('my', 25), ('of', 24), ('and', 21), ('a', 19), ('the', 18), ('me', 15), ('to', 15), ('is', 13)]
print(len(male_words))
print(len(female_words))
844 915
# Compute difference
# For every word used by either gender: female count minus male count.
# A Counter returns 0 for a word it has never seen.
diff_counts = {
    w: female_words[w] - male_words[w]
    for w in set(female_words.keys()) | set(male_words.keys())
}
# Ascending order: most male-leaning words first, most female-leaning last.
sorted_diffs = sorted(diff_counts.items(), key=lambda pair: pair[1])
print('Top Male Terms (diff):')
pprint(sorted_diffs[:10])
print('\nTop Female Terms (diff):')
pprint(sorted_diffs[-10:])
Top Male Terms (diff): [('and', -13), ('with', -10), ('life', -7), ('in', -6), ('too', -6), ('fun', -5), ('words', -4), ('at', -4), ('be', -4), ('enjoy', -4)] Top Female Terms (diff): [('things', 5), ('re', 5), ('my', 6), ('she', 6), ('one', 6), ('m', 6), ('is', 6), ('of', 7), ('me', 8), ('you', 24)]
**A problem with difference of counts:**
What if we have more male than female words in total?
Instead, consider "the probability that a male user writes the word w"
**Odds Ratio (OR)**
The ratio of the probabilities for a word from each class:
# $$ OR(w) = \frac{p(w|female)}{p(w|male)} $$
def counts_to_probs(gender_words):
    """ Compute the probability of each term according to its frequency
    in a gender.
    Args:
      gender_words ... Mapping from term to its count within one gender.
    Returns:
      dict mapping each term to count / (sum of all counts). An empty
      mapping yields an empty dict.
    """
    total = sum(gender_words.values())
    return {word: count / total for word, count in gender_words.items()}
male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)
print('p(w|male)')
pprint(sorted(male_probs.items(), key=lambda x: -x[1])[:10])
print('\np(w|female)')
pprint(sorted(female_probs.items(), key=lambda x: -x[1])[:10])
p(w|male) [('and', 0.025660377358490565), ('i', 0.02339622641509434), ('the', 0.015849056603773583), ('a', 0.01509433962264151), ('my', 0.014339622641509434), ('of', 0.012830188679245283), ('in', 0.012075471698113207), ('love', 0.010566037735849057), ('with', 0.009811320754716982), ('to', 0.00830188679245283)] p(w|female) [('i', 0.023458445040214475), ('you', 0.01943699731903485), ('my', 0.01675603217158177), ('of', 0.0160857908847185), ('and', 0.014075067024128687), ('a', 0.012734584450402145), ('the', 0.012064343163538873), ('me', 0.010053619302949061), ('to', 0.010053619302949061), ('is', 0.00871313672922252)]
def odds_ratios(male_probs, female_probs):
    """ Compute p(w|female) / p(w|male) for every word in either mapping.
    Args:
      male_probs ..... dict from word to p(w|male).
      female_probs ... dict from word to p(w|female).
    Returns:
      dict from word to its odds ratio.
    Raises:
      KeyError if a word is present in only one of the two mappings; both
      must cover the same vocabulary.
    """
    vocabulary = set(male_probs) | set(female_probs)
    ratios = {}
    for term in vocabulary:
        ratios[term] = female_probs[term] / male_probs[term]
    return ratios
ors = odds_ratios(male_probs, female_probs)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-110-6f72aa3012f6> in <module>() 3 for w in set(male_probs) | set(female_probs)]) 4 ----> 5 ors = odds_ratios(male_probs, female_probs) <ipython-input-110-6f72aa3012f6> in odds_ratios(male_probs, female_probs) 1 def odds_ratios(male_probs, female_probs): 2 return dict([(w, female_probs[w] / male_probs[w]) ----> 3 for w in set(male_probs) | set(female_probs)]) 4 5 ors = odds_ratios(male_probs, female_probs) <ipython-input-110-6f72aa3012f6> in <listcomp>(.0) 1 def odds_ratios(male_probs, female_probs): 2 return dict([(w, female_probs[w] / male_probs[w]) ----> 3 for w in set(male_probs) | set(female_probs)]) 4 5 ors = odds_ratios(male_probs, female_probs) KeyError: 'rock'
print(len(male_probs))
print(len(female_probs))
print(female_probs['rock'])
'rock' in male_probs
844 915 0.0013404825737265416
False
**How to deal with 0-probabilities?**
$$p(w|male) = \frac{freq(w, male)}{\sum_i freq(w_i, male)}$$
If a word never appears in a male profile, then $freq(w, male) = 0$, so $p(w|male) = 0$.
Do we really believe there is 0 probability of a male using this term?
(Recall over-fitting discussion.)
**Additive Smoothing**
Reserve small amount of counts (e.g., 1) for unseen observations.
E.g., assume we've seen each word at least once in each class.
$$p(w|male) = \frac{1 + freq(w, male)}{|W| + \sum_i freq(w_i, male)}$$
$|W|$: number of unique words.
# Additive smoothing. Add count of 1 for all words.
# Counter.update() with a set adds 1 to the count of each element, so after
# this every word seen for either gender has a count of at least 1 in both.
# NOTE: male_words / female_words are modified in place; running these
# statements again adds another 1 to every word.
all_words = set(male_words) | set(female_words)
male_words.update(all_words)
female_words.update(all_words)
male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)
# Show the 10 most probable male terms after smoothing.
print('\n'.join(str(x) for x in
                sorted(male_probs.items(), key=lambda x: -x[1])[:10]))
('and', 0.012114918656974732) ('i', 0.01107649705780547) ('the', 0.00761509172724126) ('a', 0.007268951194184839) ('my', 0.006922810661128418) ('of', 0.006230529595015576) ('in', 0.005884389061959155) ('love', 0.005192107995846314) ('with', 0.004845967462789893) ('to', 0.004153686396677051)
# Even though the word doesn't appear in any male profile, it has non-zero probability.
print(male_probs['rock'])
0.00034614053305642093
ors = odds_ratios(male_probs, female_probs)
sorted_ors = sorted(ors.items(), key=lambda x: -x[1])
print('Top Female Terms (OR):')
pprint(sorted_ors[:20])
print('\nTop Male Terms (OR):')
pprint(sorted_ors[-20:])
Top Female Terms (OR): [('she', 6.6174738219895275), ('one', 6.6174738219895275), ('19', 5.672120418848166), ('student', 5.672120418848166), ('things', 5.672120418848166), ('right', 4.726767015706806), ('towards', 4.726767015706806), ('reaction', 4.726767015706806), ('first', 4.726767015706806), ('you', 4.726767015706806), ('bayern', 4.726767015706806), ('insta', 4.726767015706806), ('from', 3.781413612565445), ('those', 3.781413612565445), ('jo', 3.781413612565445), ('technology', 3.781413612565445), ('forever', 3.781413612565445), ('we', 3.781413612565445), ('follow', 3.781413612565445), ('aliciagworld', 3.781413612565445)] Top Male Terms (OR): [('fan', 0.31511780104712045), ('youtu', 0.31511780104712045), ('when', 0.31511780104712045), ('football', 0.31511780104712045), ('game', 0.31511780104712045), ('gamer', 0.31511780104712045), ('rest', 0.31511780104712045), ('with', 0.27010097232610325), ('sports', 0.23633835078534032), ('father', 0.23633835078534032), ('person', 0.23633835078534032), ('far', 0.23633835078534032), ('know', 0.23633835078534032), ('drummer', 0.23633835078534032), ('beer', 0.23633835078534032), ('mother', 0.23633835078534032), ('words', 0.18907068062827226), ('enjoy', 0.18907068062827226), ('fun', 0.15755890052356022), ('too', 0.13505048616305163)]