from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import string
import numpy as np
import scipy as sp
from scipy import stats
%pylab inline
Populating the interactive namespace from numpy and matplotlib
Create a profile with only basic, generic information. The identity is set up to differ from the commonly seen 20-something US profiles, so that we can learn later on whether the system returns results based only on my preferences and search criteria, or whether it is still customizing the results based on my profile.
I'm looking for:
Open general browse page and set search criteria.
Straight Male:
Straight Female:
Bisexual All Gender:
Copy the search page URL, and login response cookies to the code.
# This function did not work for some reason, so I had to paste in my browser cookies manually.
def getCookies():
    """Post the login form to OkCupid and return the response's cookies.

    The credentials below are placeholders; substitute real ones before
    calling.
    """
    credentials = {
        'login_username': 'uhohcantletyouknow',
        'login_password': 'uhohcantletyouknow',
    }
    response = requests.post('https://www.okcupid.com/login', data=credentials)
    return response.cookies
# NOTE(review): self-assignment placeholder - presumably the real browser
# cookies were pasted here and later redacted. As written this line needs
# `cookies` to be defined already; confirm before running.
cookies = cookies
# get a maximum of 30 usernames from one search
def getUsernames():
    """Fetch one page of match results and return the usernames on it.

    The search URL asks for 30 results (``count=30``), so at most 30
    usernames come back per call. Uses the module-level ``cookies`` for
    the request.

    Returns:
        list: usernames taken from the ``usr-<name>-wrapper`` ids of the
        match cards. Cards whose id is missing or has another shape are
        skipped instead of raising.
    """
    # Adjacent literals are joined at compile time, so the URL cannot pick
    # up stray whitespace the way a backslash continuation inside a string
    # literal does as soon as the continued line is indented.
    url = ('http://www.okcupid.com/match?filter1=0,48&filter2=2,100,18'
           '&filter3=5,2678400&filter4=1,1&locid=0&timekey=1'
           '&matchOrderBy=SPECIAL_BLEND&custom_search=0&fromWhoOnline=0'
           '&mygender=m&update_prefs=1&sort_type=0&sa=1'
           '&using_saved_search=&count=30')
    page = requests.get(url, cookies=cookies).text
    soup = BeautifulSoup(page, 'html5lib')
    cards = soup.find_all('div', {'class': 'match_card_wrapper user-not-hidden '})
    usernames = []
    for card in cards:
        # `or ''` covers cards that carry no id attribute at all.
        match = re.search('usr-(.*)-wrapper', card.get('id') or '')
        if match:
            usernames.append(match.group(1))
    return usernames
# repeat the above search multiple times
def getLotsUsernames(pages=1000):
    """Repeat the match search ``pages`` times and collect the usernames.

    Args:
        pages: number of search pages to fetch. The default of 1000 pages
            at 30 usernames per page gives about 30,000 usernames.

    Returns:
        set: the unique usernames seen across all fetched pages.
    """
    usernames = []
    for page in range(pages):
        usernames += getUsernames()
        # 1-based count, so the final line reports "<pages> of <pages>".
        print('Scraped %d of %d targeted pages.' % (page + 1, pages))
    unique = set(usernames)
    print('Downloaded %d usernames, of which %d are unique.'
          % (len(usernames), len(unique)))
    return unique
The usernames are used to generate unique user profile URLs. Scraping 300 pages (9,000 usernames, not including duplicates) for each gender takes over an hour. Just to be safe, I saved the scraped usernames into csv files.
The scraping was done on my local server for speed.
Downloaded 30,000 usernames, of which 20,565 are unique. Unique percentage: 68.55%
Downloaded 30,000 usernames, of which 24,195 are unique. Unique percentage: 80.65%
Downloaded 300,000 usernames, of which 23,565 are unique. Unique percentage: 7.855%
%%time
usernames = getLotsUsernames()
usernames = list(usernames)
for i in range(len(usernames)):
usernames[i] = usernames[i].encode('utf-8')
# write usernames into new file
def writeUsernames(usernames):
    """Overwrite ``usernames.txt`` with the given usernames, one per line.

    Args:
        usernames: sized iterable of username strings (a list or a set).
    """
    # join() replaces the quadratic `+=` build-up, and the buffer no longer
    # shadows the imported `string` module with a local of the same name.
    lines = ''.join(name + '\n' for name in usernames)
    with open('usernames.txt', 'w') as f:
        f.write(lines)
    print('%d of usernames have been written into usernames.txt.' % len(usernames))
# append newly scraped usernames to the existing file (redundancies from multiple scraping attempts are removed when the file is reloaded below)
def appendUsernames(usernames):
    """Append the given usernames to ``usernames.txt``, one per line.

    Existing file content is kept; no de-duplication is done here.

    Args:
        usernames: sized iterable of username strings (a list or a set).
    """
    # join() replaces the quadratic `+=` build-up, and the buffer no longer
    # shadows the imported `string` module with a local of the same name.
    lines = ''.join(name + '\n' for name in usernames)
    with open('usernames.txt', 'a') as f:
        f.write(lines)
    print('%d of usernames have been added into usernames.txt.' % len(usernames))
# Add the usernames scraped in this run to usernames.txt.
appendUsernames(usernames)
plt.rcParams['figure.figsize'] = 6, 6
# Build the Series from parallel lists so the category order is fixed and
# each wedge gets its intended colour; iterating a plain dict gives an
# arbitrary order on older Pythons.
# 782 is the bisexual profile count reported everywhere else in this
# notebook (the previous 235 contradicted it).
a = pd.Series([2054, 2412, 782],
              index=['Straight Male', 'Straight Female', 'Bisexual'])
a.plot(kind='pie', colors=('dodgerblue', '#F08080', 'mediumpurple'))
plt.title('Gender Distribution of My Dataset', fontsize=25)
# Opens a new, empty 8x8 figure; the pie above was already drawn at 6x6.
plt.figure(figsize=(8, 8))
<matplotlib.figure.Figure at 0x109a5cad0>
<matplotlib.figure.Figure at 0x109a5cad0>
Number of total profiles scraped:
Straight Male: 2,054
Straight Female: 2,412
Bisexual All Gender: 782
--User basic information: gender, age, location, orientation, ethnicities, height, bodytype, diet, smoking, drinking, drugs, religion, sign, education, job, income, status, monogamous, children, pets, languages
--User matching information: gender orientation, age range, location, single, purpose
--User self-description: summary, what they are currently doing, what they are good at, noticeable facts, favourite books/movies, things they can’t live without, how to spend time, friday activities, private thing, message preference
def _first_text(soup, tag, attrs):
    """Return the text of the first ``tag`` matching ``attrs``, or ''."""
    found = soup.find(tag, attrs)
    return found.get_text() if found is not None else ''


def getProfile(num, username):
    """Scrape the first ``num`` profiles listed in ``username``.

    Uses the module-level ``cookies`` for every request.

    Args:
        num: how many entries of ``username`` to scrape, counted from the
            start of the sequence.
        username: sequence of OkCupid usernames.

    Returns:
        dict: column name -> list of strings with one entry per profile
        that answered with HTTP 200, suitable for ``pd.DataFrame``. A field
        that is absent from a page is stored as ''. The dict is empty when
        no profile was scraped.
    """
    # (column, tag, attribute filter) for the header fields of a profile.
    header_fields = [
        ('gender', 'span', {'class': 'ajax_gender'}),
        ('age', 'span', {'id': 'ajax_age'}),
        ('location', 'span', {'id': 'ajax_location'}),
        ('frequency', 'div', {'class': 'tooltip_text hidden'}),
    ]
    # user basic information, found in <dd id="ajax_...">
    basic = ['orientation', 'ethnicities', 'height', 'bodytype', 'diet',
             'smoking', 'drinking', 'drugs', 'religion', 'sign', 'education',
             'job', 'income', 'status', 'monogamous', 'children', 'pets',
             'languages']
    # user matching information, found in <li id="ajax_...">
    find = ['gentation', 'ages', 'near', 'single', 'lookingfor']
    # user self description, found in <div id="essay_text_N">
    text = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

    result = {}
    # A separate index name keeps the `num` parameter from being shadowed.
    for idx in range(num):
        name = username[idx]
        url = 'http://www.okcupid.com/profile/' + name
        # One request per profile: the response that passes the status
        # check is also the one that gets parsed.
        response = requests.get(url, cookies=cookies)
        ok = response.status_code == 200
        if ok:
            # Name the parser explicitly, matching getUsernames().
            soup = BeautifulSoup(response.text, 'html5lib')
            result.setdefault('username', []).append(name)
            # Every column receives exactly one value per profile ('' when
            # the element is missing) so all lists stay the same length.
            for column, tag, attrs in header_fields:
                result.setdefault(column, []).append(
                    _first_text(soup, tag, attrs))
            for column in basic:
                result.setdefault(column, []).append(
                    _first_text(soup, 'dd', {'id': 'ajax_' + column}))
            for column in find:
                result.setdefault(column, []).append(
                    _first_text(soup, 'li', {'id': 'ajax_' + column}))
            for column in text:
                result.setdefault(column, []).append(
                    _first_text(soup, 'div', {'id': 'essay_text_' + column}))
        print('%d of %d %s' % (idx, len(username), ok))
    return result
# Reload every username saved so far (one per line) and de-duplicate them.
l =[]
with open('usernames.txt', 'r') as f:
for line in f:
l.append(line.rstrip('\n'))
s = set(l)
print len(l), 'of usernames have been added to the usernames list.'
print len(s), "of them are unique."
# rewrite the file if there were redundancies
if len(l) != len(s):
writeUsernames(s)
print 'usernames.txt file has been rewrriten.'
l = list(s)
# Set the number of usernames to scrape (here: all of them)
result = getProfile(len(l), l)
profile = pd.DataFrame(result)
# Give the ten essay columns ('0'..'9') descriptive names.
profile = profile.rename(columns = {'0':'0summary','1':'1doing','2':'2goodat','3':'3notice',\
'4':'4books','5':'5without','6':'6spendtime','7':'7friday','8':'8private','9':'9message'})
profile = profile.set_index(['username'])
print profile.columns
# Export the profiles to csv
profile.to_csv('profile.csv',encoding='utf-8')
# IPython shell escape: show the first five lines of the exported file.
!head -5 profile.csv
# Bare expression: the notebook displays the DataFrame.
profile
# Load the male profile export and count profiles per gender label.
p = pd.read_csv('profiles(male).csv')
p.groupby('gender').size()
gender Man 2053 Man, Genderfluid, Genderqueer, Non-binary 1 dtype: int64
# Load the female profile export and count profiles per gender label.
p2 = pd.read_csv('profiles(female).csv')
p2.groupby('gender').size()
gender Agender 1 Cis Woman 1 Genderfluid, Woman, Non-binary, Agender 1 Non-binary 1 Transfeminine, Woman, Transgender, Non-binary, Trans Woman 1 Transgender 1 Transgender, Transsexual, Woman 1 Woman 2405 dtype: int64
# Load the most recent export (used below as the bisexual dataset) and
# count profiles per gender label.
p3 = pd.read_csv('profile.csv')
p3.groupby('gender').size()
gender Agender 1 Agender, Genderfluid 1 Agender, Genderqueer 1 Androgynous 1 Androgynous, Non-binary 1 Androgynous, Woman 1 Cis Man 1 Cis Woman 5 Gender Nonconforming 2 Genderfluid 3 Genderfluid, Genderqueer 1 Genderfluid, Man 1 Genderqueer 4 Genderqueer, Gender Nonconforming, Non-binary 1 Genderqueer, Genderfluid 1 Man 146 Man, Agender 1 Man, Cis Man 2 Non-binary 6 Non-binary, Androgynous, Genderfluid, Genderqueer 1 Non-binary, Man 1 Non-binary, Transfeminine 1 Other 2 Trans Man, Transmasculine, Transgender 1 Trans Woman 5 Trans Woman, Transgender 1 Trans Woman, Transgender, Two Spirit 1 Transfeminine, Transgender, Trans Woman 1 Transgender, Genderfluid, Man 1 Transgender, Man, Genderqueer, Genderfluid 1 Transgender, Trans Woman 1 Woman 575 Woman, Cis Woman 3 Woman, Gender Nonconforming 1 Woman, Gender Nonconforming, Genderfluid 1 Woman, Genderqueer 1 Woman, Man 2 Woman, Transgender 1 Woman, Transsexual 1 dtype: int64
%pylab inline
plt.rcParams['figure.figsize'] = 12, 6
sns.distplot(p.age, color = "dodgerblue")
plt.xlabel('Age', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.xticks(np.arange(20,85,5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18,80)
plt.title('Straight Male Users Age Density', fontsize = 25)
plt.text(55, 0.035, '2056 users', fontsize = 20)
plt.text(55, 0.03, 'Average age: 44', fontsize = 20)
print '\n\n','The average age of', len(p), 'staright female users is', round(mean(p.age))
Populating the interactive namespace from numpy and matplotlib The average age of 2054 staright female users is 44.0
plt.rcParams['figure.figsize'] = 12, 6
# Derive the annotations from the data instead of hard-coding them.
user_count = len(p2)
average_age = round(p2.age.mean())
sns.distplot(p2.age, color='#FF4D4D')
plt.xlabel('Age', fontsize=20)
plt.ylabel('Density', fontsize=20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18, 60)
plt.title('Straight Female Users Age Density', fontsize=25)
plt.text(45, 0.10, '%d users' % user_count, fontsize=20)
plt.text(45, 0.08, 'Average age: %d' % average_age, fontsize=20)
# Message typo fixed ("staright" -> "straight").
print('\n\nThe average age of %d straight female users is %.1f'
      % (user_count, average_age))
The average age of 2412 staright female users is 35.0
plt.rcParams['figure.figsize'] = 12, 6
# Derive the annotations from the data instead of hard-coding them.
user_count = len(p3)
average_age = round(p3.age.mean())
sns.distplot(p3.age, color='mediumpurple')
plt.xlabel('Age', fontsize=20)
plt.ylabel('Density', fontsize=20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18, 50)
plt.title('Bisexual Mixed Gender Users Age Density', fontsize=25)
plt.text(37, 0.09, '%d users' % user_count, fontsize=20)
plt.text(37, 0.08, 'Average age: %d' % average_age, fontsize=20)
# `p3` is the bisexual mixed-gender dataset; the message used to say
# "staright female".
print('\n\nThe average age of %d bisexual mixed gender users is %.1f'
      % (user_count, average_age))
The average age of 782 staright female users is 26.0
def locationranks(df):
    """Count users per trailing location component and return the top ten.

    Each ``df.location`` value is split on ', ' and only the LAST piece is
    counted (for 'A, B' that is 'B').

    Args:
        df: DataFrame with a ``location`` column of strings.

    Returns:
        pandas.Series: counts indexed by location piece, in descending
        order, truncated to the ten largest. Missing locations are ignored.
    """
    ranks = {}
    for place in df.location:
        if pd.isnull(place):
            # A missing location is not a string and cannot be split.
            continue
        key = place.rsplit(', ', 1)[-1]
        ranks[key] = ranks.get(key, 0) + 1
    # Sort in plain Python: Series.order() no longer exists in current
    # pandas, and this form runs on old and new releases alike.
    top = sorted(ranks.items(), key=lambda item: item[1], reverse=True)[:10]
    return pd.Series([count for _, count in top],
                     index=[key for key, _ in top], dtype='int64')
# Bar chart of the ten most frequent location labels in the male dataset.
plt.rcParams.update({'figure.figsize': (12, 12)})
locationranks(p).plot(
    kind='bar',
    fontsize=14,
    legend=False,
    color='dodgerblue',
)
plt.title('Top 10 Cities of Straight Male Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Switch to the ggplot style, then chart the ten most frequent location
# labels in the female dataset.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 12)})
locationranks(p2).plot(
    kind='bar',
    fontsize=14,
    legend=False,
)
plt.title('Top 10 Cities of Straight Female Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
(array([ 0., 50., 100., 150., 200., 250., 300., 350., 400.]), <a list of 9 Text yticklabel objects>)
# Bar chart of the ten most frequent location labels in the bisexual dataset.
plt.rcParams.update({'figure.figsize': (12, 12)})
locationranks(p3).plot(
    kind='bar',
    fontsize=14,
    legend=False,
    color='mediumpurple',
)
plt.title('Top 10 Cities of Mixed Gender Bisexual Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
(array([ 0., 20., 40., 60., 80., 100., 120., 140.]), <a list of 8 Text yticklabel objects>)
# Plot, for each age of the men in `p`, the mean lower and upper bound of the
# partner age range they list in the `ages` column.
young = []
old = []
for i in p.index:
# Lower bound: the two digits right before the en dash in the `ages` text.
y = int(re.findall('(\d\d)–', p.ages[i])[0])
# Upper bound: the two digits right after the en dash.
o = int(re.findall('–(\d\d)', p.ages[i])[0])
young.append(y)
old.append(o)
young = pd.Series(young)
old = pd.Series(old)
# One row per user: own age plus the parsed bounds.
# NOTE(review): the new columns are matched to rows by index label, so this
# assumes `p` has the default 0..n-1 index - confirm.
agerange = pd.DataFrame(p.age, columns=['age'])
agerange['young'] = young
agerange['old'] = old
print len(agerange)
plt.rcParams['figure.figsize'] = 12, 6
# Mean bounds per own age.
# NOTE(review): the name `plot` presumably shadows the plot() function that
# `%pylab inline` put into the namespace - confirm nothing later relies on it.
plot = agerange.groupby(agerange.age).mean()
plt.plot(plot.index, plot.index, color='blue', label = 'His age')
plt.plot(plot.index, plot.young, color='dodgerblue', label = 'Her youngest age')
plt.plot(plot.index, plot.old, color='dodgerblue', label = 'Her oldest age')
# NOTE(review): these two assignments run after the lines are drawn and
# before the fills; they appear to pin end points onto the diagonal so the
# shaded polygons close there - confirm the intent and that the chained
# assignment really reaches `plot`.
plot.old[18] = 18
plot.young[99] = 99
plt.fill(plot.index, plot.old, color='dodgerblue', alpha = 0.3)
plt.fill(plot.index, plot.young, color='dodgerblue', alpha = 0.3)
plt.xlabel('His Age', fontsize = 20)
plt.ylabel('Her Age Range', fontsize = 20)
plt.xticks(np.arange(20,85,5), fontsize=20)
plt.yticks(fontsize=20)
plt.title('Age Preferences of Straight Men', fontsize = 25)
plt.xlim(18,60)
# loc = 4 places the legend in the lower right corner.
plt.legend(loc = 4)
2054
<matplotlib.legend.Legend at 0x10b479a90>
# Plot, for each age of the women in `p2`, the mean lower and upper bound of
# the partner age range they list in the `ages` column.
young = []
old = []
for i in p2.index:
# Lower bound: the two digits right before the en dash in the `ages` text.
y = int(re.findall('(\d\d)–', p2.ages[i])[0])
# Upper bound: the two digits right after the en dash.
o = int(re.findall('–(\d\d)', p2.ages[i])[0])
young.append(y)
old.append(o)
young = pd.Series(young)
old = pd.Series(old)
# One row per user: own age plus the parsed bounds.
# NOTE(review): the new columns are matched to rows by index label, so this
# assumes `p2` has the default 0..n-1 index - confirm.
agerange = pd.DataFrame(p2.age, columns=['age'])
agerange['young'] = young
agerange['old'] = old
print len(agerange)
# Mean bounds per own age, restricted to ages up to 60.
# NOTE(review): the name `plot` presumably shadows the plot() function that
# `%pylab inline` put into the namespace - confirm nothing later relies on it.
plot = agerange.groupby(agerange.age).mean()
plot = plot.loc[plot.index <= 60,]
plt.rcParams['figure.figsize'] = 12, 6
plt.plot(plot.index, plot.index, color='red', label = 'Her age')
plt.plot(plot.index, plot.young, color='#F08080', label = 'His youngest age')
plt.plot(plot.index, plot.old, color='#F08080', label = 'His oldest age')
# NOTE(review): these two assignments run after the lines are drawn and
# before the fills; they appear to pin end points onto the diagonal so the
# shaded polygons close there. Unlike the male cell, the first one sets the
# whole row for age 18 - confirm the intent.
plot.loc[18] = 18
plot.young[60] = 60
plt.fill(plot.index, plot.old, color='#F08080', alpha = 0.3)
plt.fill(plot.index, plot.young, color='#F08080', alpha = 0.3)
plt.xlabel('Her Age', fontsize = 20)
plt.ylabel('His Age Range', fontsize = 20)
plt.xticks(np.arange(20,85,5), fontsize=20)
plt.yticks(fontsize=20)
plt.title('Age Preferences of Straight Women', fontsize = 25)
plt.xlim(18,60)
# loc = 4 places the legend in the lower right corner.
plt.legend(loc = 4)
2412
<matplotlib.legend.Legend at 0x10af51fd0>