Eamonn Bell, Columbia University <epb2125@columbia.edu>
Work in progress. Some interesting questions worth asking are bolded if there's anything in the notebook that moves towards a solution.
Thanks to https://github.com/bmcfee for the parsing code. The dataset this notebook is based on was released under CC0 1.0 Universal.
import lxml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
import collections
import glob
from pprint import pprint
from IPython.display import HTML, Image
from lxml import etree, objectify
%matplotlib inline
!git clone https://github.com/nyphilarchive/PerformanceHistory.git
Cloning into 'PerformanceHistory'... remote: Counting objects: 350, done. remote: Total 350 (delta 0), reused 0 (delta 0), pack-reused 350 Receiving objects: 100% (350/350), 12.53 MiB | 7.66 MiB/s, done. Resolving deltas: 100% (253/253), done. Checking connectivity... done.
# Author: https://github.com/bmcfee/nycphil (Brian McFee)
def parse_programs(programs):
return [parse_program(x) for x in programs]
def parse_program(program):
dispatch = dict(concertInfo=parse_concertInfo,
worksInfo=parse_worksInfo)
data = dict()
for child in program.getchildren():
if child.tag in dispatch:
data[child.tag] = dispatch[child.tag](child)
else:
data[child.tag] = child.text
return data
def parse_concertInfo(concertInfo):
data = dict()
for child in concertInfo.getchildren():
data[child.tag] = child.text
return data
def parse_worksInfo(worksInfo):
data = list()
for child in worksInfo.getchildren():
data.append(parse_work(child))
return data
def parse_work(work):
dispatch = dict(soloists=parse_soloists)
data = dict()
for child in work.getchildren():
if child.tag in dispatch:
data[child.tag] = dispatch[child.tag](child)
else:
data[child.tag] = child.text
return data
def parse_soloists(soloists):
data = list()
for child in soloists.getchildren():
data.append(parse_soloist(child))
return data
def parse_soloist(soloist):
data = dict()
for child in soloist.getchildren():
data[child.tag] = child.text
return data
def flatten(d):
works = d.pop('worksInfo', [])
concertInfo = d.pop('concertInfo', [])
out = []
for w in works:
out.append(concertInfo.copy())
# Added this to get soloist's names in. Dirty.
soloists = w.get('soloists', None)
if soloists is not None:
soloists_names = [s.get('soloistName') for s in soloists if s.get('soloistName') is not None]
soloists_tsv = "\t".join(soloists_names)
out[-1].update({'soloists_tsv' : soloists_tsv})
w.pop('soloists', [])
out[-1].update(d)
out[-1].update(w)
return out
def load_programs():
# We need this to handle badly formatted &'s in strings
parser = etree.XMLParser(recover=True)
fd = []
globbed = sorted(glob.glob('./PerformanceHistory/Programs/*.xml'))
for xmlfile in globbed:
obj = objectify.parse(xmlfile, parser=parser)
dix = parse_programs(obj.getroot())
for _ in dix:
if _['programID'] == '11451':
print _['programID']
fd.extend(flatten(_))
df = pd.DataFrame.from_records(fd)
df['oldDate'] = df['Date']
df['Date'] = pd.to_datetime(df['Date'])
del df['worksInfo']
del df['work']
del df['concertInfo']
return df
df = load_programs()
df.head()
Date | Location | Time | Venue | composerName | conductorName | eventType | id | interval | orchestra | program | programID | season | soloists_tsv | workTitle | oldDate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1842-12-07 05:00:00 | Manhattan, NY | 8:00PM | Apollo Rooms | Beethoven, Ludwig van | Hill, Ureli Corelli | Subscription Season | 38e072a7-8fc9-4f9a-8eac-3957905c0002 | NaN | New York Philharmonic | NaN | 3853 | 1842-43 | NaN | SYMPHONY NO. 5 IN C MINOR, OP.67 | 1842-12-07T05:00:00Z |
1 | 1842-12-07 05:00:00 | Manhattan, NY | 8:00PM | Apollo Rooms | Weber, Carl Maria Von | Timm, Henry C. | Subscription Season | 38e072a7-8fc9-4f9a-8eac-3957905c0002 | NaN | New York Philharmonic | NaN | 3853 | 1842-43 | Otto, Antoinette | OBERON | 1842-12-07T05:00:00Z |
2 | 1842-12-07 05:00:00 | Manhattan, NY | 8:00PM | Apollo Rooms | Hummel, Johann | NaN | Subscription Season | 38e072a7-8fc9-4f9a-8eac-3957905c0002 | NaN | New York Philharmonic | NaN | 3853 | 1842-43 | Scharfenberg, William\tHill, Ureli Corelli\tDe... | QUINTET, PIANO, D MINOR, OP. 74 | 1842-12-07T05:00:00Z |
3 | 1842-12-07 05:00:00 | Manhattan, NY | 8:00PM | Apollo Rooms | NaN | NaN | Subscription Season | 38e072a7-8fc9-4f9a-8eac-3957905c0002 | Intermission | New York Philharmonic | NaN | 3853 | 1842-43 | NaN | NaN | 1842-12-07T05:00:00Z |
4 | 1842-12-07 05:00:00 | Manhattan, NY | 8:00PM | Apollo Rooms | Weber, Carl Maria Von | Etienne, Denis G. | Subscription Season | 38e072a7-8fc9-4f9a-8eac-3957905c0002 | NaN | New York Philharmonic | NaN | 3853 | 1842-43 | NaN | OBERON | 1842-12-07T05:00:00Z |
# Get, e.g., top 5 composers by performances of all time
sample_list = list(df.composerName.value_counts()[1:5].index)
sample = df[df.composerName.isin(sample_list)]
all_works = df.groupby(df['Date'].map(lambda x:x.year)).count()
yearly_counts = pd.Series(all_works['id'], index=all_works.index)
yearly_counts.describe()
count 157.000000 mean 558.694268 std 567.675746 min 18.000000 25% 130.000000 50% 430.000000 75% 646.000000 max 2484.000000 Name: id, dtype: float64
yearly_counts.plot()
plt.title('# works performed in the NY Phil Program Archives')
<matplotlib.text.Text at 0x110fd0350>
all_programs = df.groupby(df['Date'].map(lambda x:x.year)).programID.nunique()
all_programs.plot()
<matplotlib.axes.AxesSubplot at 0x110f0bb50>
What's the deal with 1956? There's a bunch of phoney data somewhere. That peak of 900 or so should be distributed over the decade.
all_programs.ix[1950:1970].plot()
<matplotlib.axes.AxesSubplot at 0x110e5dad0>
df.Year = df.Date.map(lambda x:x.year)
df.dtypes
Date datetime64[ns] Location object Time object Venue object composerName object conductorName object eventType object id object interval object orchestra object program object programID object season object soloists_tsv object workTitle object oldDate object dtype: object
# See if the issue is with the datetime parsing from before. Seems like it's not.
df['Sanity'] = (df.Date.map(lambda x:str(x)) == df.oldDate.map(lambda x:str(x).replace('Z', '').replace('T', ' ')))
df.Sanity.value_counts()
True 87715 dtype: int64
del df['Sanity']
# Now, check if the application of x.year is OK
# First do a kludgy year parse on the string rep of the datetime, i.e. oldDate
oldDate_example = '1842-12-07T05:00:00Z'
def year_from_oldDate(oldDate):
return int(oldDate.split('-')[0])
assert year_from_oldDate('1842-12-07T05:00:00Z') == 1842
df['oldYear'] = df['oldDate'].map(year_from_oldDate)
# Same spike!
all_programs = df.groupby(df['oldYear']).programID.nunique()
all_programs.plot()
<matplotlib.axes.AxesSubplot at 0x110519bd0>
Here's one I know is messed up. See the season field.
df[df.programID == '11451']
Date | Location | Time | Venue | composerName | conductorName | eventType | id | interval | orchestra | program | programID | season | soloists_tsv | workTitle | oldDate | oldYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
40633 | 1956-07-12 04:00:00 | Manhattan, NY | 8:30PM | Lewisohn Stadium | NaN | NaN | Stadium Concert | c7f0aeae-8de8-4f42-a040-f5a7e624311b | NaN | Stadium-NY Philharmonic | NaN | 11451 | 1962-63 | NaN | NaN | 1956-07-12T04:00:00Z | 1956 |
Here's the salient part of the original .xml. I've no idea what the hell is going on.
!cat './PerformanceHistory/Programs/1955-56_TO_1962-63.xml' | grep -C 10 '11451'
</work> <work> <composerName>Falla, Manuel de</composerName> <workTitle>THREE-CORNERED HAT (EL SOMBRERO DE TRES PICOS), SUITE NO. 2</workTitle> <conductorName>Rudel, Julius</conductorName> </work> </worksInfo> </program> <program> <id>c7f0aeae-8de8-4f42-a040-f5a7e624311b</id> <programID>11451</programID> <orchestra>Stadium-NY Philharmonic</orchestra> <season>1962-63</season> <concertInfo> <eventType>Stadium Concert</eventType> <Location>Manhattan, NY</Location> <Venue>Lewisohn Stadium</Venue> <Date>1963-07-25T04:00:00Z</Date> <Time>8:30PM</Time> </concertInfo> <worksInfo>
for composer in sample_list:
one_composer = df[df.composerName == composer]
aggregate = one_composer.groupby(one_composer['Date'].map(lambda x:x.year)).count()
composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=composer)
composer_counts.plot(legend=True, label=composer, alpha=0.7)
plt.ylabel('Number of works performed that year')
plt.xlabel('Year of performance')
<matplotlib.text.Text at 0x10bf6b390>
for name in sample_list:
one_composer = df[df.composerName == name]
aggregate = one_composer.groupby(one_composer['Date'].map(lambda x:x.year)).count()
composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
composer_counts_prop = composer_counts.divide(yearly_counts) * 100
composer_counts_prop.plot(legend=True, label=name)
plt.ylabel('% of works performed that year')
plt.xlabel('Year of performance')
<matplotlib.text.Text at 0x109fcbad0>
df.composerName.value_counts()[50:60]
Rubinstein, Anton 162 Ives, Charles 157 Goldmark, Karl 156 Dukas, Paul 154 Respighi, Ottorino 145 Massenet, Jules 142 Barber, Samuel 142 Hadley, Henry Kimball 141 Borodin, Alexander 137 Chabrier, Emmanuel 130 dtype: int64
Who on earth is Hadley, Henry Kimball
def composer_counts_by_name(name):
composer = df[df.composerName == name]
aggregate = composer.groupby(composer['Date'].map(lambda x:x.year)).count()
annual_composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
return annual_composer_counts
def plot_composer_by_name(name):
composer_counts_by_name(name).plot(legend=True, label=name)
plot_composer_by_name('Milhaud, Darius')
plot_composer_by_name('Gould, Morton')
plot_composer_by_name('Ravel, Maurice')
plot_composer_by_name('Hadley, Henry Kimball')
hadley = df[df.composerName == 'Hadley, Henry Kimball']
hadley.groupby([df.workTitle], sort=True).count()['id'].order(ascending=False).head(10)
workTitle CULPRIT FAY RHAPSODY, OP. 62 60 SALOME, OP. 55 18 IN BOHEMIA 10 OCEAN, THE, OP. 99 8 CHINESE SKETCHES - STREETS OF PEKIN 6 SYMPHONY NO. 3, B MINOR, OP. 60 5 SILHOUETTES, OP.77 (ARR. Roberts) 5 SYMPHONY NO. 4, D MINOR, OP. 64, "NORTH, EAST, SOUTH, WEST" 5 LUCIFER, OP. 66 4 SYMPHONY NO. 2, F MINOR, OP.30 (FOUR SEASONS) 3 Name: id, dtype: int64
soloists = df.soloists_tsv[df.soloists_tsv.notnull()]
soloist_list = list(soloists)
len([s for s in soloists if ('\t' in s) and (';' in s)])
0
tab_separated = [t.split('\t') for t in [s for s in soloists if ('\t') in s]]
semicolon_separated = [t.split(';') for t in [s for s in soloists if (';') in s]]
soloists_split = tab_separated + semicolon_separated
len(soloists_split)
5250
from itertools import combinations
played_with_pairs = []
for collection in soloists_split:
for pair in combinations(collection, 2):
played_with_pairs.append(pair)
from collections import Counter
cnt = Counter(played_with_pairs)
top_ten_thou = cnt.most_common(n=10000)
with open('edges.txt', 'w') as f:
f.write('source;target;weight\n')
for edge, weight in top_ten_thou:
try:
f.write("{}".format(";".join(edge)) + ";{}\n".format(weight))
except:
continue
!wc edges.txt
9783 32217 333024 edges.txt
!head edges.txt
source;target;weight Smith, Philip;Alessi, Joseph;209 Smith, Philip;Myers, Philip;179 Myers, Philip;Alessi, Joseph;163 Smith, Philip;Sullivan, Robert;162 Sullivan, Robert;Alessi, Joseph;161 Alessi, Joseph;Deck, Warren;142 Smith, Philip;Deck, Warren;141 Sullivan, Robert;Myers, Philip;140 Myers, Philip;Deck, Warren;130
Process this with something like networkx
or Gephi to get something like this:
[pending]
df.Location.value_counts().head(10)
Manhattan, NY 48285 Philadelphia, PA 1217 Brooklyn, NY 1096 Washington, DC 441 Greenvale, NY 277 Baltimore, MD 272 Bronx, NY 202 Princeton, NJ 183 Newark, NJ 174 Pittsburgh, PA 168 dtype: int64
def get_state(location_str):
splitted = location_str.split(', ')
if len(splitted) != 2:
return None
elif len(splitted[1]) == 2:
return splitted[1].strip()
else:
return None
def test_get_state():
assert get_state('Manhattan, NY') == 'NY'
assert get_state('Dublin, IRELAND') is None
assert get_state('foobar,,') is None
return True
def run_tests():
assert test_get_state()
return True
assert run_tests()
df['State'] = df.Location.apply(get_state)
in_usa = df[df.State.notnull()]
out_of_state = df[df.State != 'NY']
decade_state = out_of_state.groupby([(out_of_state.Date.apply(lambda x: x.year)//10)*10,
out_of_state.State]).count()
decade_state.head(10)
Date | Location | Time | Venue | composerName | conductorName | eventType | id | interval | orchestra | program | programID | season | soloists_tsv | workTitle | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | State | |||||||||||||||
1890 | MA | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 0 | 5 | 0 | 5 | 5 | 1 | 5 |
1900 | DC | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 2 | 0 | 2 | 2 | 0 | 2 |
IA | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | |
IL | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 0 | 8 | 0 | 8 | 8 | 2 | 8 | |
IN | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 2 | 0 | 2 | 2 | 0 | 2 | |
KS | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | |
KY | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 2 | 0 | 2 | 2 | 0 | 2 | |
MI | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 0 | 3 | 3 | 0 | 3 | |
MN | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 0 | 4 | 0 | 4 | 4 | 0 | 4 | |
MO | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 0 | 3 | 3 | 2 | 3 |
nineties = decade_state.loc[1990]
# Move index to column for use in plotting package later
nineties.reset_index(level=0, inplace=True)
twenties = decade_state.loc[1920]
twenties.reset_index(level=0, inplace=True)
Use folium
for chloropleth visualization
!wget https://raw.githubusercontent.com/python-visualization/folium/master/examples/us-states.json
def inline_map(m, width=650, height=500):
"""Takes a folium instance and embed HTML."""
m._build_map()
srcdoc = m.HTML.replace('"', '"')
embed = HTML('<iframe srcdoc="{}" '
'style="width: {}px; height: {}px; '
'border: none"></iframe>'.format(srcdoc, width, height))
return embed
def state_concert_counts(state_data):
state_geo = r'us-states.json'
f = folium.Map(location=[48, -102], zoom_start=3, max_zoom=4, min_zoom=3)
f.geo_json(geo_path=state_geo, data=state_data,
data_out='data.json',
columns=['State', 'programID'],
key_on='feature.id',
fill_color='YlGn', fill_opacity=0.7, line_opacity=0.2,
legend_name='Concerts played')
return inline_map(f)
state_concert_counts(nineties)
state_concert_counts(twenties)
For those of you following along with e.g. GitHub/ipynbviewer
, this won't render because it depends on some hosted .json files. Screenshots below:
Image('1920s.png')