import pandas as pd
import time
from math import ceil
import pickle
# load pickle of all words and decades, and drop words that appear in more than 15 decades
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
origlen = len(df)
origwds = len(df.word.unique())
df = df[df.nonalpha == False]  # drop words containing non-alphanumeric characters
wordcount = pd.DataFrame(df.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df = df[df.word.isin(wordcount.index)]
df = df[['word', 'decade', 'pct']]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
print df.head(10)
2539728 records reduced to 1214911 (47.8 %)
436103 words reduced to 289826 (66.5 %)
    word  decade       pct
102  aaa    1850  0.000006
103  aaa    1910  0.000009
104  aaa    1920  0.000008
105  aaa    1930  0.001382
106  aaa    1940  0.000170
107  aaa    1950  0.000110
108  aaa    1960  0.000035
109  aaa    1970  0.000052
110  aaa    1980  0.000070
111  aaa    1990  0.000319
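The pickle's exact schema isn't shown in this notebook; the cell above implies at least the columns word, decade, pct (a per-decade frequency measure), and a boolean nonalpha flag. A minimal synthetic frame consistent with that assumption, handy for trying the filters without the full corpus:

import pandas as pd

# hypothetical miniature of coha_1.pickle's assumed schema
toy = pd.DataFrame({
    'word':     ['aaa', 'aaa', 'b-52', 'cat'],
    'decade':   [1850, 1910, 1950, 1810],
    'pct':      [0.000006, 0.000009, 0.0002, 0.01],
    'nonalpha': [False, False, True, False],
})
toy = toy[toy.nonalpha == False]                      # drops 'b-52'
counts = toy.groupby('word').decade.count()
toy = toy[toy.word.isin(counts[counts <= 15].index)]  # keeps 'aaa' and 'cat'
print(toy)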
# keep only words that appear in the crossword dictionary, which screens out proper nouns
origlen = len(df)
origwds = len(df.word.unique())
import json
with open('../data_user_pickle_csv/coha_and_xword.json') as f:
    xwords = json.load(f)
df = df[df.word.isin(xwords)]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
1214911 records reduced to 313863 (25.8 %)
289826 words reduced to 41068 (14.2 %)
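The layout of coha_and_xword.json isn't shown here; the isin call implies a flat JSON array of lowercase words. A sketch of that assumption, using a set for cheap repeated membership tests:

import json

# hypothetical miniature of the file's assumed layout: a flat array of words
sample = '["aloe", "adze", "area"]'
xword_set = set(json.loads(sample))  # set membership is O(1) per lookup
print('area' in xword_set)           # True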
# keep words ranked in the top 10000 by either summed or maximum pct
origlen = len(df)
origwds = len(df.word.unique())
dfsum = pd.DataFrame(df.groupby('word').pct.sum())
dfsum.sort_values('pct', ascending=False, inplace=True)
dfsum = dfsum[:10000]
dfmax = pd.DataFrame(df.groupby('word').pct.max())
dfmax.sort_values('pct', ascending=False, inplace=True)
dfmax = dfmax[:10000]
df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
313863 records reduced to 128688 (41.0 %)
41068 words reduced to 11740 (28.6 %)
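The same union-of-top-lists step can also be written more compactly; a sketch (agg and keep are illustrative names):

# one groupby pass computes both aggregates; nlargest sorts and truncates
agg = df.groupby('word').pct.agg(['sum', 'max'])
keep = (set(agg['sum'].nlargest(10000).index) |
        set(agg['max'].nlargest(10000).index))
df = df[df.word.isin(keep)]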
# add per-decade average pct, decade count, and decade specificity to dfsum
series_count = df.groupby('word').decade.count()
n_decades = series_count.reindex(dfsum.index)  # each word's decade count, aligned to dfsum
dfsum['pct_per_decade'] = dfsum.pct / n_decades
dfsum['decades'] = n_decades
dfsum['decade_specificity'] = 20 - n_decades   # the corpus spans 20 decades (1810-2000)
dfsum.sort_values('pct_per_decade', ascending=False, inplace=True)
print(dfsum.head(50))
                    pct  pct_per_decade  decades  decade_specificity
word
soviet         0.133485        0.011124       12                   8
radio          0.102161        0.007859       13                   7
phone          0.106324        0.007595       14                   6
television     0.071525        0.006502       11                   9
okay           0.063832        0.006383       10                  10
telephone      0.092873        0.006192       15                   5
movie          0.056559        0.005142       11                   9
programs       0.062321        0.004155       15                   5
nuclear        0.047605        0.003967       12                   8
computer       0.045346        0.003779       12                   8
cigarette      0.053111        0.003541       15                   5
airport        0.031493        0.003499        9                  11
automobile     0.038150        0.002935       13                   7
photo          0.043835        0.002922       15                   5
sutta          0.008483        0.002828        3                  17
movies         0.030464        0.002769       11                   9
baseball       0.037568        0.002505       15                   5
shit           0.027538        0.002295       12                   8
weekend        0.024362        0.002215       11                   9
unemployment   0.026051        0.002171       12                   8
concept        0.032165        0.002144       15                   5
aircraft       0.027237        0.002095       13                   7
scheduled      0.027750        0.001982       14                   6
fucking        0.013840        0.001977        7                  13
parking        0.023615        0.001968       12                   8
golf           0.029198        0.001947       15                   5
global         0.019391        0.001939       10                  10
environmental  0.023204        0.001934       12                   8
garage         0.022876        0.001906       12                   8
brittles       0.001743        0.001743        1                  19
soviets        0.016589        0.001659       10                  10
computers      0.014902        0.001656        9                  11
fizgig         0.001648        0.001648        1                  19
almah          0.001642        0.001642        1                  19
buddy          0.019513        0.001626       12                   8
fuck           0.014576        0.001620        9                  11
cloddy         0.009557        0.001593        6                  14
gasoline       0.022212        0.001587       14                   6
output         0.023681        0.001579       15                   5
electronic     0.015746        0.001575       10                  10
racial         0.023483        0.001566       15                   5
airplane       0.017098        0.001554       11                   9
nazi           0.018378        0.001532       12                   8
regional       0.021399        0.001528       14                   6
airlines       0.013531        0.001503        9                  11
skills         0.022295        0.001486       15                   5
basketball     0.017710        0.001476       12                   8
techniques     0.018977        0.001460       13                   7
taxi           0.018859        0.001451       13                   7
video          0.017279        0.001440       12                   8
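A quick spot-check of the derived columns against the top row: soviet appears in 12 of the 20 decades, so its per-decade average is 0.133485 / 12 ≈ 0.011124 and its specificity is 20 - 12 = 8.

row = dfsum.loc['soviet']
print(row.pct / row.decades)  # ~0.011124, matches pct_per_decade
print(20 - row.decades)       # 8, matches decade_specificity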
# for contrast, rebuild the rankings without the crossword-dictionary filter to surface proper nouns
df_proper = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df_proper = df_proper[df_proper.nonalpha == False]  # drop words containing non-alphanumeric characters
wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df_proper = df_proper[df_proper.word.isin(wordcount.index)]
df_proper = df_proper[['word', 'decade', 'pct']]
df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())
df_propersum.sort_values('pct', ascending=False, inplace=True)
df_propersum = df_propersum[:10000]
df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())
df_propermax.sort_values('pct', ascending=False, inplace=True)
df_propermax = df_propermax[:10000]
df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]
proper_series_count = df_proper.groupby('word').decade.count()
df_propersum['pct_per_decade'] = (df_propersum.pct /
                                  proper_series_count.reindex(df_propersum.index))
df_propersum.sort_values('pct_per_decade', ascending=False, inplace=True)
df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]
print(df_propersum.head(50))
df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')
                 pct  pct_per_decade
word
dorriville  0.033207        0.033207
altorf      0.042033        0.021016
madiboo     0.018765        0.018765
selico      0.018074        0.018074
pacomo      0.016863        0.016863
pufpace     0.016171        0.016171
brazzo      0.015393        0.015393
lescourt    0.013923        0.013923
rossberg    0.027246        0.013623
rheinthal   0.011415        0.011415
plotwell    0.011242        0.011242
fourbin     0.010983        0.010983
immorina    0.010983        0.010983
ridolpho    0.010810        0.010810
bertocci    0.010118        0.010118
demba       0.010118        0.010118
torribal    0.009858        0.009858
devalmore   0.009512        0.009512
erlach      0.037984        0.009496
lesc        0.009426        0.009426
ploughby    0.009253        0.009253
eberard     0.018079        0.009040
makesafe    0.008994        0.008994
ksenia      0.008994        0.008994
joblin      0.017905        0.008952
mentzikoff  0.008648        0.008648
usaldo      0.008561        0.008561
ubal        0.008475        0.008475
almeyda     0.016175        0.008088
hippolito   0.015397        0.007699
barogo      0.007610        0.007610
beraldo     0.015061        0.007531
hardrun     0.007523        0.007523
arandez     0.007351        0.007351
maillac     0.007091        0.007091
mahadi      0.007091        0.007091
bloomville  0.028237        0.007059
spendall    0.007005        0.007005
lanissa     0.006659        0.006659
spicket     0.006287        0.006287
ridol       0.006226        0.006226
shenac      0.006137        0.006137
rainouard   0.006053        0.006053
flaurence   0.005967        0.005967
wildenhain  0.017735        0.005912
cerval      0.011765        0.005883
oresca      0.005794        0.005794
quicksite   0.005448        0.005448
darina      0.005362        0.005362
chetwynde   0.005292        0.005292
# make pivot table showing in which decades words occurred
decades = list(range(1810, 2010, 10))
dftop = dfsum[:50].copy()  # copy so the new columns land in dftop, not in a view of dfsum
dftoplookup = df.copy()
for decade in decades:
    dftop[decade] = 0.0
for word in dftop.index:
    for decade in decades:
        match = dftoplookup[(dftoplookup.word == word) &
                            (dftoplookup.decade == decade)]
        if len(match) > 0:
            dftop.loc[word, decade] = match.pct.iloc[0]
print(dftop.head())
                 pct  pct_per_decade  decades  decade_specificity  1810  1820  \
word
soviet      0.133485        0.011124       12                   8     0     0
radio       0.102161        0.007859       13                   7     0     0
phone       0.106324        0.007595       14                   6     0     0
television  0.071525        0.006502       11                   9     0     0
okay        0.063832        0.006383       10                  10     0     0

            1830  1840  1850  1860  ...      1910      1920      1930  \
word                                ...
soviet         0     0     0     0  ...  0.000208  0.004427  0.006941
radio          0     0     0     0  ...  0.000253  0.005014  0.012146
phone          0     0     0     0  ...  0.001941  0.001890  0.005214
television     0     0     0     0  ...  0.000000  0.000499  0.000588
okay           0     0     0     0  ...  0.000000  0.000008  0.001353

                1940      1950      1960      1970      1980      1990  \
word
soviet      0.011937  0.030687  0.022103  0.019477  0.027580  0.007039
radio       0.017642  0.014509  0.012027  0.009458  0.010498  0.010467
phone       0.006885  0.008053  0.012009  0.013640  0.014318  0.018203
television  0.001981  0.008791  0.011224  0.014161  0.014006  0.010957
okay        0.003058  0.004406  0.006313  0.008459  0.009622  0.014044

                2000
word
soviet      0.003077
radio       0.010118
phone       0.023380
television  0.009295
okay        0.016564

[5 rows x 24 columns]
dftop.to_csv('coha_top_decades.csv')
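The nested lookup loop can also be collapsed into a single pivot; a sketch (wide and dftop_alt are illustrative names; aggfunc='first' mirrors the .pct.iloc[0] lookup above):

wide = (df[df.word.isin(dftop.index)]
        .pivot_table(index='word', columns='decade', values='pct', aggfunc='first')
        .reindex(columns=decades)  # ensure all 20 decade columns exist
        .fillna(0.0))
dftop_alt = dfsum[:50].join(wide)  # same shape and values as dftop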