Higher Education R&D dataset

In [1]:
# Importing the libraries that will be useful here
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
import numpy as np
import os
%matplotlib inline
In [2]:
# reading in the data (I saved it to my local file as CSV)
# The location can be overridden with the HERD_DATA_PATH environment variable so the
# notebook can run on another machine; otherwise it falls back to the original local path.
ed = pd.read_csv(os.environ.get(
    'HERD_DATA_PATH',
    '/Users/austinbrian/dev/blog/datasets/HERD2015_RandD_by_sector.csv'))
In [56]:
# A quick look at the top of the dataset, to make sure everything came in OK
# I cheated a little here and went back to the CSV and eliminated commas, as it was a little easier to do that way
ed.head(5)
Out[56]:
Rank Institution Environmental sciences Life sciences Math and computer sciences Physical sciences Psychology Social sciences Sciences, nec Engineering All non-S&E fields
0 1 Johns Hopkins U. 31854 867715 171205 167009 3663 11034 54640 991937 6622
1 32 U. Illinois, Urbana-Champaign 7214 220029 114512 67182 17276 21340 5000 161458 25806
2 24 Georgia Institute of Technology 19068 19879 113353 47279 7431 9132 7645 533329 8254
3 89 Carnegie Mellon U. 348 11212 109026 14162 7757 6791 3479 89054 175
4 28 U. Southern California 20051 411987 93765 16924 9935 27941 327 69527 40574
In [57]:
ed[ed.Rank==2]
Out[57]:
Rank Institution Environmental sciences Life sciences Math and computer sciences Physical sciences Psychology Social sciences Sciences, nec Engineering All non-S&E fields
24 2 U. Michigan, Ann Arbor 14609 779922 25434 52449 21989 149805 1627 254505 68938
In [4]:
# This function looks at the data to make sure I don't have any missing values
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe``.

    Prints, in order: the frame's shape; a per-column table with the number
    of missing values, the dtype and the number of unique values; and the
    output of ``dataframe.describe(include='all')``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3, unlike the Python 2-only `print a, b` statement form.
    print("Dataframe Shape %s" % (dataframe.shape,))
    print("")
    table = pd.DataFrame({"Missing": dataframe.isnull().sum(),
                          "Types": dataframe.dtypes,
                          "Uniques": [dataframe[i].nunique() for i in dataframe]})
    print(table)
    print("")
    print("Describe Dataframe")
    print(dataframe.describe(include='all'))
In [5]:
# After you make a function, you have to actually run it
eda(ed)
Dataframe Shape (640, 11)

                            Missing   Types  Uniques
Rank                              0   int64      635
Institution                       0  object      640
Environmental sciences            0   int64      408
Life sciences                     0   int64      591
Math and computer sciences        0   int64      414
Physical sciences                 0   int64      476
Psychology                        0   int64      360
Social sciences                   0   int64      397
Sciences, nec                     0   int64      233
Engineering                       0   int64      372
All non-S&E fields                0   int64      501

Describe Dataframe
              Rank                      Institution  Environmental sciences  \
count   640.000000                              640              640.000000   
unique         NaN                              640                     NaN   
top            NaN  Louisiana State U., Baton Rouge                     NaN   
freq           NaN                                1                     NaN   
mean    320.492188                              NaN             5076.668750   
std     184.885804                              NaN            16247.201522   
min       1.000000                              NaN                0.000000   
25%     160.750000                              NaN                0.000000   
50%     320.500000                              NaN              198.500000   
75%     480.250000                              NaN             1997.000000   
max     640.000000                              NaN           169678.000000   

        Life sciences  Math and computer sciences  Physical sciences  \
count    6.400000e+02                  640.000000         640.000000   
unique            NaN                         NaN                NaN   
top               NaN                         NaN                NaN   
freq              NaN                         NaN                NaN   
mean     6.065628e+04                 4070.756250        7361.401563   
std      1.483988e+05                13446.323577       20277.317083   
min      0.000000e+00                    0.000000           0.000000   
25%      4.587500e+02                    4.000000          30.750000   
50%      2.295000e+03                  208.500000         530.500000   
75%      2.515100e+04                 1818.000000        4464.250000   
max      1.075635e+06               171205.000000      169149.000000   

          Psychology  Social sciences  Sciences, nec    Engineering  \
count     640.000000       640.000000     640.000000     640.000000   
unique           NaN              NaN            NaN            NaN   
top              NaN              NaN            NaN            NaN   
freq             NaN              NaN            NaN            NaN   
mean     1850.025000      3627.956250    1690.635937   17297.278125   
std      4576.223637     10928.929547    7561.262366   59004.695924   
min         0.000000         0.000000       0.000000       0.000000   
25%         0.000000         2.000000       0.000000       0.000000   
50%        74.500000       168.500000       0.000000     208.500000   
75%      1208.500000      1875.250000     262.250000    9678.250000   
max     46707.000000    149805.000000  123658.000000  991937.000000   

        All non-S&E fields  
count           640.000000  
unique                 NaN  
top                    NaN  
freq                   NaN  
mean           5662.442187  
std           12943.654068  
min               0.000000  
25%             111.000000  
50%             892.500000  
75%            4414.750000  
max          130711.000000  
In [6]:
# Feature columns: every column after the first two (Rank and Institution).
X_cols = ed.columns[2:]

Plot some distributions

In [7]:
plt.style.use('fivethirtyeight')

This function plots one field's R&D spending against another's and colors each point by the school's overall R&D rank, with higher-ranked schools (lower rank numbers) shown in darker green.

In [8]:
def plot_ed(var_X1,var_X2,c=None,co="Greens_r"):
    """Scatter-plot one column of ``ed`` against another.

    Parameters
    ----------
    var_X1 : str
        Column of ``ed`` plotted on the x axis (also used as the x label).
    var_X2 : str
        Column of ``ed`` plotted on the y axis (also used as the y label).
    c : array-like, optional
        Per-point values mapped to colors. When omitted, ``ed.Rank`` is
        used, looked up at call time so it always matches the current ``ed``.
    co : str, optional
        Name of the matplotlib colormap applied to ``c``.
    """
    if c is None:
        # Resolve the default here rather than in the signature: a default of
        # `ed.Rank` would be frozen when the function is defined and would go
        # stale if `ed` were reloaded afterwards.
        c = ed.Rank
    plt.figure(figsize=(8,6))
    plt.scatter(ed[var_X1],ed[var_X2],c=c,cmap=co,alpha=.8)
    plt.title(var_X1+' vs '+var_X2+' R&D Research $',fontsize=18)
    plt.xlabel(var_X1,fontsize = 14)
    plt.ylabel(var_X2,fontsize = 14)
In [9]:
plot_ed('Social sciences','Math and computer sciences')
In [10]:
plot_ed('Engineering','Social sciences')
In [11]:
plot_ed('Environmental sciences','Engineering')
In [12]:
plot_ed('Psychology','Life sciences')

This one's interesting! It's a bit of a closer relationship.

In [13]:
# Let's just plot them all.
sns.pairplot(ed[X_cols])
Out[13]:
<seaborn.axisgrid.PairGrid at 0x116f85c50>

Interesting, but not super conclusive on anything.

Cluster Analysis

I'm going to use both a DBSCAN clustering algorithm and a hierarchical algorithm to group schools. As the plots here show, there aren't a lot of clusters going on for any pairs, so I don't necessarily expect to get a whole lot.

The reason for looking at a cluster analysis is that we don't necessarily have a natural classification scheme for these schools. There isn't an obvious "target" to identify them as a type, so we want to find similarities within the dataset we have.

In [14]:
# more libraries
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model, metrics

DBSCAN

In [15]:
# DBSCAN
X = ed[X_cols].values
y = ed['Rank']
dbscn = DBSCAN(eps = 1000, min_samples = 4).fit(X)  # played with epsilon and min samples
labels = dbscn.labels_
# DBSCAN labels noise points -1; that label is not counted as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Number of points carrying each label (including the -1 noise label), counted in a
# single pass instead of rescanning `labels` once per distinct label.
cluster_groups = {label: int(count)
                  for label, count in zip(*np.unique(labels, return_counts=True))}
cluster_groups
Out[15]:
{-1: 387, 0: 245, 1: 4, 2: 4}
In [16]:
# Report the number of clusters found, then score the labelling.
print('Estimated number of clusters: %d' % n_clusters_)
# Label-agreement scores of `labels` against `y`: 1 is best for homogeneity and
# completeness, and V-measure is their harmonic mean.
for score_template, score_fn in (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score)):
    print(score_template % score_fn(y, labels))
# The silhouette coefficient is computed from the feature matrix itself; higher is better.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
Estimated number of clusters: 3
Homogeneity: 0.114
Completeness: 1.000
V-measure: 0.205
Silhouette Coefficient: -0.074

This is a bad score. Our data isn't really very "dense" so a density algorithm isn't going to give us a lot of value-add here.

Let's look at it anyway.

In [17]:
# This is just a quick reminder of the order of our variables
X_cols
Out[17]:
Index([u'Environmental sciences', u'Life sciences',
       u'Math and computer sciences', u'Physical sciences', u'Psychology',
       u'Social sciences', u'Sciences, nec', u'Engineering',
       u'All non-S&E fields'],
      dtype='object')
In [18]:
plt.figure(figsize=(8,6))
unique_labels = np.unique(labels)
colors = plt.cm.Spectral(np.linspace(0,1, len(unique_labels)))

# Look the plotted columns up by name rather than by hard-coded position, so the
# plot stays correct if the column order of X_cols ever changes.
psych_col = X_cols.get_loc('Psychology')
life_col = X_cols.get_loc('Life sciences')

for (label, color) in zip(unique_labels, colors):
    class_member_mask = (labels == label)
    n = X[class_member_mask]
    # Psychology and Life Sciences Plot again.
    # The RGBA row is passed as a plain tuple: handing matplotlib a NumPy array here
    # is what triggers the "comparison to `None`" FutureWarning in lines.py.
    plt.plot(n[:,psych_col],n[:,life_col], 'o', markerfacecolor = tuple(color),
             markersize = 8,alpha=.3)

# Label the figure so it can be read on its own.
plt.title('DBSCAN clusters: Psychology vs Life sciences',fontsize=18)
plt.xlabel('Psychology',fontsize = 14)
plt.ylabel('Life sciences',fontsize = 14)
/Users/austinbrian/anaconda/lib/python2.7/site-packages/matplotlib/lines.py:1206: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self._markerfacecolor != fc:

Yikes. All of our clusters but one sit right around the origin - and even the lighter and darker pinks aren't distinguishing features: the darker shade only appears where points overlap, because I set a transparency so that I could see the overlaps.

Hierarchical clustering

In [19]:
# libraries
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist
In [20]:
# Build the cluster hierarchy with Ward linkage.
Z = linkage(X, method='ward')

# Cophenetic correlation coefficient: compares the distances implied by the hierarchy
# with the original pairwise distances between the points.
pairwise_dists = pdist(X)
c, coph_dists = cophenet(Z, pairwise_dists)
c
Out[20]:
0.85448874350133697

Not bad!

In [21]:
# Draw the hierarchy as a dendrogram, then label the figure.
dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
plt.title('Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
plt.show()

It looks like there is some good separation of clusters right about the 1,000,000 point, so it's a good place to set a max value.

In [22]:
# The dendrogram shows good separation at a linkage distance of about 1,000,000, so we cut
# the tree there; fcluster (from scipy.cluster.hierarchy) returns a flat cluster ID per row.
max_dist = 1000000
clusters = fcluster(Z, max_dist, criterion='distance')

# Let's plot our data and assign the class labels as the color
# (column 4 of X is Psychology, column 1 is Life sciences):
plt.figure(figsize=(8,6))
plt.scatter(X[:,4], X[:,1], c=clusters, cmap='prism')
plt.show()

Very cool! There's some identifiable separation here. We can plot more maps this same way, but first let's clean up this one.

In [23]:
plot_ed('Psychology','Life sciences',c=clusters, co='prism')
In [24]:
plot_ed('Environmental sciences','Life sciences',c=clusters, co='prism')
In [25]:
plot_ed('Math and computer sciences','Physical sciences',c=clusters, co='prism')

Matching schools back to clusters

These seem like useful clusters, but it would be more useful if we could show which institutions were in which cluster.

In [34]:
# Pair every institution with its cluster ID, in row order. The pairs are built once and
# materialised in a list so they can be indexed and iterated repeatedly (zip is lazy on
# Python 3).
clust_zip = list(zip(ed.Institution, clusters))
In [35]:
clust_zip
Out[35]:
[('Johns Hopkins U.', 6),
 ('U. Illinois, Urbana-Champaign', 4),
 ('Georgia Institute of Technology', 4),
 ('Carnegie Mellon U.', 2),
 ('U. Southern California', 3),
 ('Massachusetts Institute of Technology', 4),
 ('U. Texas, Austin', 2),
 ('Pennsylvania State U., University Park and Hershey Medical Center', 4),
 ('U. Maryland, College Park', 2),
 ('U. California, San Diego', 5),
 ('Brown U.', 2),
 ('U. Utah', 3),
 ('North Carolina State U.', 4),
 ('U. Minnesota, Twin Cities', 5),
 ('Indiana U., Bloomington', 3),
 ('U. Wisconsin-Madison', 5),
 ('U. Chicago', 3),
 ('Ohio State U.', 5),
 ('Stanford U.', 5),
 ('U. California, Los Angeles', 5),
 ('Virginia Polytechnic Institute and State U.', 4),
 ('Purdue U., West Lafayette', 4),
 ('Columbia U. in the City of New York', 5),
 ('Rutgers, State U. New Jersey, New Brunswick', 3),
 ('U. Michigan, Ann Arbor', 5),
 ('U. North Carolina, Chapel Hill', 5),
 ('U. Alabama, Huntsville', 1),
 ('U. Washington, Seattle', 5),
 ('New York U.', 3),
 ('U. Central Florida', 2),
 ('U. Tennessee, Knoxville', 2),
 ('Cornell U.', 5),
 ('U. Hawaii, Manoa', 2),
 ('U. Massachusetts, Amherst', 2),
 ('Texas A&M U., College Station and Health Science Center', 4),
 ('Michigan State U.', 4),
 ('Arizona State U.', 2),
 ('Princeton U.', 2),
 ('Iowa State U.', 2),
 ('SUNY, Stony Brook U.', 2),
 ('Duke U.', 5),
 ('U. Texas M. D. Anderson Cancer Center', 5),
 ('U. California, Irvine', 2),
 ('U. Pennsylvania', 5),
 ('Rice U.', 2),
 ('Harvard U.', 5),
 ('U. California, Berkeley', 4),
 ('George Mason U.', 1),
 ('U. Illinois, Chicago', 3),
 ('U. Alabama, Tuscaloosa', 1),
 ('SUNY, U. Buffalo', 3),
 ('Florida International U.', 2),
 ('U. Nebraska, Lincoln', 2),
 ('Rensselaer Polytechnic Institute', 2),
 ('U. Notre Dame', 2),
 ('U. California, Santa Barbara', 2),
 ('U. California, Davis', 5),
 ('Yale U.', 5),
 ('Mississippi State U.', 2),
 ('Northeastern U.', 1),
 ('Florida State U.', 2),
 ('U. Arizona', 4),
 ('U. Georgia', 3),
 ('New Jersey Institute of Technology', 2),
 ('California Institute of Technology', 2),
 ('U. Louisiana, Lafayette', 1),
 ('U. Colorado Boulder', 2),
 ('U. Florida', 5),
 ('U. Pittsburgh, Pittsburgh', 5),
 ('Oregon State U.', 2),
 ('Rockefeller U.', 3),
 ('U. Texas, Dallas', 1),
 ('U. Maryland, Baltimore County', 1),
 ('U. South Florida, Tampa', 3),
 ('Boston U.', 3),
 ('Clemson U.', 2),
 ('U. Virginia, Charlottesville', 3),
 ('Naval Postgraduate School', 1),
 ('U. Houston', 2),
 ('George Washington U.', 2),
 ('SUNY, U. Albany', 2),
 ('Air Force Institute of Technology', 1),
 ('Texas Tech U.', 2),
 ('SUNY, Binghamton U.', 1),
 ('U. Texas, El Paso', 1),
 ('Colorado State U., Fort Collins', 2),
 ('U. Delaware', 2),
 ('San Diego State U.', 1),
 ('Washington State U.', 2),
 ('Drexel U.', 2),
 ('U. Texas, Arlington', 1),
 ('Worcester Polytechnic Institute', 1),
 ('Kansas State U.', 2),
 ('U. North Texas, Denton', 1),
 ('U. North Carolina, Charlotte', 1),
 ('Tufts U.', 2),
 ('U. New Mexico', 2),
 ('Dartmouth C.', 2),
 ('U. California, Riverside', 2),
 ('Louisiana State U., Baton Rouge', 2),
 ('U. Connecticut', 2),
 ('DePaul U.', 1),
 ('Wright State U.', 1),
 ('Temple U.', 2),
 ('Syracuse U.', 1),
 ('U. Oregon', 1),
 ('North Dakota State U.', 2),
 ('Washington U., Saint Louis', 5),
 ('U. Missouri, Columbia', 2),
 ('U. Texas, San Antonio', 1),
 ('Oklahoma State U., Stillwater', 2),
 ('Rochester Institute of Technology', 1),
 ('U. Massachusetts, Lowell', 1),
 ('Wayne State U.', 2),
 ('U. California, Santa Cruz', 2),
 ('U.S. Air Force Academy', 1),
 ('Louisiana Tech U.', 1),
 ('U. South Carolina, Columbia', 2),
 ('Michigan Technological U.', 1),
 ('U. Kansas', 2),
 ('Indiana U.-Purdue U., Indianapolis', 1),
 ('U. Tulsa', 1),
 ('U. Rochester', 3),
 ('U. Memphis', 1),
 ('U. Kentucky', 3),
 ('Brandeis U.', 1),
 ('Georgia State U.', 1),
 ('Boise State U.', 1),
 ('Illinois Institute of Technology', 1),
 ('U. Massachusetts, Boston', 1),
 ('Utah State U.', 2),
 ('U. Idaho', 2),
 ('North Carolina Agricultural and Technical State U.', 1),
 ('Brigham Young U., Provo', 1),
 ('U. Iowa', 3),
 ('U. Nebraska, Omaha', 1),
 ('Toyota Technological Institute, Chicago', 1),
 ('Georgetown U.', 2),
 ('U. Dayton', 2),
 ('Kent State U.', 1),
 ('U. Miami', 3),
 ('Stevens Institute of Technology', 1),
 ('Texas State U.', 1),
 ('California State U., San Bernardino', 1),
 ('Northwestern U.', 5),
 ('Portland State U.', 1),
 ('Emory U.', 5),
 ('C. of William and Mary and Virginia Institute of Marine Science', 1),
 ('CUNY, City C.', 1),
 ('Missouri U. of Science and Technology', 1),
 ('New Mexico State U.', 2),
 ('Tulane U.', 2),
 ('Old Dominion U.', 1),
 ('Jackson State U.', 1),
 ('U. Oklahoma, Norman and Health Science Center', 2),
 ('Southern Methodist U.', 1),
 ('U. Nevada, Reno', 1),
 ('U. Alaska, Fairbanks', 1),
 ('San Francisco State U.', 1),
 ('U. Wisconsin-Milwaukee', 1),
 ('Virginia Commonwealth U.', 2),
 ('U. Cincinnati', 3),
 ('U. New Hampshire', 1),
 ('U. Missouri, Kansas City', 1),
 ('U. Wyoming', 1),
 ('U.S. Naval Academy', 1),
 ('Texas A&M U.-Corpus Christi', 1),
 ('U. North Carolina, general administration', 1),
 ('West Virginia U.', 2),
 ('U. Arkansas, Little Rock', 1),
 ('Harvey Mudd C.', 1),
 ('U. California, Office of the President', 1),
 ('U. Colorado Colorado Springs', 1),
 ('Lehigh U.', 1),
 ('U.S. Military Academy', 1),
 ('U. New Orleans', 1),
 ('American U.', 1),
 ('Delaware State U.', 1),
 ('Montana State U., Bozeman', 1),
 ('Baylor U.', 1),
 ('Villanova U.', 1),
 ('Bryn Mawr C.', 1),
 ('U. Massachusetts, Dartmouth', 1),
 ('U. Vermont', 2),
 ('Vanderbilt U.', 5),
 ('Alabama A&M U.', 1),
 ('Northern Arizona U.', 1),
 ('CUNY, Queens C.', 1),
 ('Florida Institute of Technology', 1),
 ('U. South Alabama', 1),
 ('U. Nevada, Las Vegas', 1),
 ('Gallaudet U.', 1),
 ('Bowie State U.', 1),
 ('Marquette U.', 1),
 ('U. Tennessee, Chattanooga', 1),
 ('California Polytechnic State U., San Luis Obispo', 1),
 ('U. Alabama, Birmingham', 5),
 ('Boston C.', 1),
 ('SUNY, Polytechnic Institute', 4),
 ('Tennessee Technological U.', 1),
 ('Tuskegee U.', 1),
 ('Desert Research Institute', 1),
 ('Clarkson U.', 1),
 ('Ball State U.', 1),
 ('Florida Atlantic U.', 1),
 ('Case Western Reserve U.', 3),
 ('Creighton U.', 1),
 ('Rutgers, State U. New Jersey, Newark', 1),
 ('U. Maine', 1),
 ('Howard U.', 1),
 ('U. Puerto Rico, Mayaguez', 1),
 ('California State U., Northridge', 1),
 ('Dakota State U.', 1),
 ('Elizabeth City State U.', 1),
 ('U. California, Merced', 1),
 ('Southern Illinois U., Carbondale', 1),
 ('Illinois State U.', 1),
 ('Norfolk State U.', 1),
 ('CUNY, system office', 1),
 ('Smith C.', 1),
 ('Western Washington U.', 1),
 ('U. Puerto Rico, Rio Piedras', 1),
 ('Williams C.', 1),
 ('South Dakota State U.', 1),
 ('Texas Southern U.', 1),
 ('Western Michigan U. and Homer Stryker M.D. School of Medicine', 1),
 ('Morgan State U.', 1),
 ('U. Montana, Missoula', 1),
 ('U. Akron', 1),
 ('Hampton U.', 1),
 ('Florida A&M U.', 1),
 ('U. Colorado Denver and Anschutz Medical Campus', 3),
 ('Miami U.', 1),
 ('California State U., Bakersfield', 1),
 ('U. South Dakota', 1),
 ('U. Texas, Brownsville', 1),
 ('Sam Houston State U.', 1),
 ('C. Charleston', 1),
 ('CUNY, Hunter C.', 1),
 ('U. Texas Pan American', 1),
 ('Oakland U.', 1),
 ('Stephen F. Austin State U.', 1),
 ('Loyola U., Chicago', 1),
 ('Willamette U.', 1),
 ('Northern Illinois U.', 1),
 ('Wellesley C.', 1),
 ('U. Louisville', 2),
 ('Fordham U.', 1),
 ('Towson U.', 1),
 ('U. Central Arkansas', 1),
 ('U. Mississippi', 2),
 ('U. Minnesota, Duluth', 1),
 ('Calvin C.', 1),
 ('California State U., San Marcos', 1),
 ('U. Houston-Downtown', 1),
 ('Tennessee State U.', 1),
 ('Pennsylvania State U., Harrisburg', 1),
 ('Pace U.', 1),
 ('U. Rhode Island', 1),
 ('Columbia U., Teachers C.', 1),
 ('California State U., Monterey Bay', 1),
 ('U. Southern Mississippi', 1),
 ('U. Denver', 1),
 ('Trinity C., Hartford', 1),
 ('Colorado School of Mines', 1),
 ('U. Metropolitana', 1),
 ('Idaho State U.', 1),
 ('U. Southern Maine', 1),
 ('U. Northern Colorado', 1),
 ('CUNY, John Jay C. of Criminal Justice', 1),
 ('Montclair State U.', 1),
 ('Prairie View A&M U.', 1),
 ('Pomona C.', 1),
 ('Carleton C.', 1),
 ('Duquesne U.', 1),
 ('California State U., Fresno', 1),
 ('Vassar C.', 1),
 ('Xavier U. Louisiana', 1),
 ('St. Olaf C.', 1),
 ('Cleveland State U.', 1),
 ('U. Arkansas, Pine Bluff', 1),
 ('Marist C.', 1),
 ('San Jose State U.', 1),
 ('CUNY, Brooklyn C.', 1),
 ('California State U., Sacramento', 1),
 ('U. North Carolina, Wilmington', 1),
 ('Fayetteville State U.', 1),
 ('New School', 1),
 ('Fairfield U.', 1),
 ('Mount Holyoke C.', 1),
 ('Alcorn State U.', 1),
 ('Wesleyan U.', 1),
 ('Auburn U., Auburn', 2),
 ('Lewis and Clark C.', 1),
 ('Rutgers, State U. New Jersey, Camden', 1),
 ('U. North Carolina, Greensboro', 1),
 ('Arkansas State U., Jonesboro', 1),
 ('Saint Louis U.', 1),
 ('U. North Florida', 1),
 ('Central Connecticut State U.', 1),
 ('Central Michigan U.', 1),
 ('CUNY, C. Staten Island', 1),
 ('Trinity U.', 1),
 ('Wake Forest U.', 2),
 ('Embry-Riddle Aeronautical U.', 1),
 ('Southern Illinois U., Edwardsville', 1),
 ('U. Washington, Bothell', 1),
 ('Southern U. and A&M C., Baton Rouge', 1),
 ('Amherst C.', 1),
 ('California State U., Channel Islands', 1),
 ('U. North Dakota', 1),
 ('Lafayette C.', 1),
 ('Purdue U., Calumet', 1),
 ('Loyola Marymount U.', 1),
 ('Clark Atlanta U.', 1),
 ('Spelman C.', 1),
 ('CUNY, Graduate Center', 1),
 ('Kean U.', 1),
 ('Southern Connecticut State U.', 1),
 ('Texas A&M International U.', 1),
 ('New Mexico Institute of Mining and Technology', 1),
 ('James Madison U.', 1),
 ('Virginia State U.', 1),
 ('U. Missouri, Saint Louis', 1),
 ('Salisbury U.', 1),
 ('Colgate U.', 1),
 ('Ohio U.', 1),
 ('West Chester U. Pennsylvania', 1),
 ('U. Houston-Clear Lake', 1),
 ('Texas A&M U.-Commerce', 1),
 ('Appalachian State U.', 1),
 ('U. Washington, Tacoma', 1),
 ('Pennsylvania State U., Behrend', 1),
 ('Georgia Southern U.', 1),
 ('U. Hawaii, Hilo', 1),
 ('East Tennessee State U.', 1),
 ('CUNY, Lehman C.', 1),
 ('Lamar U.', 1),
 ('Reed C.', 1),
 ('U. Wisconsin-Stevens Point', 1),
 ('New York Institute of Technology', 1),
 ('Bowdoin C.', 1),
 ('Barnard C.', 1),
 ('Claremont Graduate U.', 1),
 ('Macalester C.', 1),
 ('Bowling Green State U.', 1),
 ('Seattle U.', 1),
 ('Oberlin C.', 1),
 ('U. Arkansas, Fayetteville', 2),
 ('U. Central Oklahoma', 1),
 ('West Virginia State U.', 1),
 ('Kennesaw State U.', 1),
 ('Elon U.', 1),
 ('U. South Carolina, Aiken', 1),
 ('Wichita State U.', 1),
 ('Azusa Pacific U.', 1),
 ('Bates C.', 1),
 ('SUNY, C. of Environmental Science and Forestry', 1),
 ('West Texas A&M U.', 1),
 ('Benedict C.', 1),
 ('Morehouse C.', 1),
 ('U. Detroit Mercy', 1),
 ('Middle Tennessee State U.', 1),
 ('Valparaiso U.', 1),
 ('Grinnell C.', 1),
 ('Swarthmore C.', 1),
 ('Winthrop U.', 1),
 ('Grand Valley State U.', 1),
 ('C. of Saint Benedict', 1),
 ('California State U., Dominguez Hills', 1),
 ('Texas Christian U.', 1),
 ('C. Wooster', 1),
 ('U. West Florida', 1),
 ('Bradley U.', 1),
 ('Rowan U.', 1),
 ('Norwich U.', 1),
 ('U. Hartford', 1),
 ('La Salle U.', 1),
 ('U. del Turabo', 1),
 ('Siena C.', 1),
 ('Lincoln U., Jefferson City', 1),
 ("Saint John's U., Collegeville", 1),
 ('Bucknell U.', 1),
 ('Shaw U.', 1),
 ('Sonoma State U.', 1),
 ('Indiana U.-Purdue U., Fort Wayne', 1),
 ('U. Wisconsin-La Crosse', 1),
 ('U. San Francisco', 1),
 ('CUNY, Baruch C.', 1),
 ('U. Wisconsin-Oshkosh', 1),
 ('Kettering U.', 1),
 ('California State U., Long Beach', 1),
 ('Middlebury C.', 1),
 ('U. South Carolina, Beaufort', 1),
 ('Davidson C.', 1),
 ('Minnesota State U., Mankato', 1),
 ('Wiley C.', 1),
 ('East Central U.', 1),
 ('U. Baltimore', 1),
 ('Ithaca C.', 1),
 ('South Dakota School of Mines and Technology', 1),
 ('Western Kentucky U.', 1),
 ('South Carolina State U.', 1),
 ('Lawrence Technological U.', 1),
 ('Eastern Michigan U.', 1),
 ('Union C., Schenectady', 1),
 ("Saint Michael's C.", 1),
 ('U. Nebraska, Kearney', 1),
 ('U. Alaska, Anchorage', 1),
 ('Colorado C.', 1),
 ('U. West Georgia', 1),
 ('Florida Gulf Coast U.', 1),
 ('St. Cloud State U.', 1),
 ('U. Toledo', 1),
 ('Fort Valley State U.', 1),
 ("St. John's U., Manhattan", 1),
 ('Haverford C.', 1),
 ('U. Wisconsin-Green Bay', 1),
 ('Fisk U.', 1),
 ('U. of the Pacific', 1),
 ('U. Minnesota, Morris', 1),
 ('Missouri State U.', 1),
 ('Pepperdine U.', 1),
 ('CUNY, Medgar Evers C.', 1),
 ('Quinnipiac U.', 1),
 ('Hamilton C.', 1),
 ('East Carolina U.', 1),
 ('Hofstra U.', 1),
 ('U. Texas, Tyler', 1),
 ('Furman U.', 1),
 ('Colby C.', 1),
 ('SUNY, Geneseo', 1),
 ('Gonzaga U.', 1),
 ('U. Wisconsin-Platteville', 1),
 ('Hope C.', 1),
 ('California State U., Chico', 1),
 ('Claflin U.', 1),
 ('CUNY, York C.', 1),
 ('Suffolk U.', 1),
 ('Kentucky State U.', 1),
 ('California State U., Fullerton', 1),
 ('Skidmore C.', 1),
 ('Western Illinois U.', 1),
 ('Murray State U.', 1),
 ('Northern Kentucky U.', 1),
 ('McNeese State U.', 1),
 ('U. San Diego', 1),
 ('Savannah State U.', 1),
 ('Rider U.', 1),
 ('California State Polytechnic U., Pomona', 1),
 ('Indiana U., South Bend', 1),
 ('U. Richmond', 1),
 ('Eastern Connecticut State U.', 1),
 ('U. of the District of Columbia', 1),
 ('Marshall U.', 1),
 ('U. Northern Iowa', 1),
 ('Niagara U.', 1),
 ('Nova Southeastern U.', 1),
 ('Rhode Island School of Design', 1),
 ('U. of Mary Washington', 1),
 ('Chapman U.', 1),
 ('U. Michigan, Dearborn', 1),
 ('Roger Williams U.', 1),
 ('Hawaii Pacific U.', 1),
 ('Jacksonville State U.', 1),
 ("Texas Woman's U.", 1),
 ('Purdue U., North Central', 1),
 ('Central State U.', 1),
 ('Albany C. of Pharmacy and Health Sciences', 1),
 ('Franklin and Marshall C.', 1),
 ('Pacific U.', 1),
 ('Washington and Lee U.', 1),
 ('Saginaw Valley State U.', 1),
 ('Western Carolina U.', 1),
 ('Dickinson C.', 1),
 ("Saint Joseph's U.", 1),
 ('Coastal Carolina U.', 1),
 ('Wheaton C., Wheaton', 1),
 ('U. North Carolina, Asheville', 1),
 ('Youngstown State U.', 1),
 ('SUNY, C. Brockport', 1),
 ('Sewanee: U. of the South', 1),
 ('Santa Clara U.', 1),
 ('Lake Superior State U.', 1),
 ('U. California, San Francisco', 5),
 ('Baylor C. of Medicine', 5),
 ('Icahn School of Medicine at Mt. Sinai', 5),
 ('U. Texas Southwestern Medical Center', 5),
 ('U. Maryland, Baltimore', 3),
 ('Scripps Research Institute', 3),
 ('Uniformed Services U. of the Health Sciences', 3),
 ('Oregon Health and Science U.', 3),
 ('Yeshiva U.', 3),
 ('U. Massachusetts, Medical School', 3),
 ('Medical U. South Carolina', 3),
 ('U. Texas Health Science Center, Houston', 3),
 ('Woods Hole Oceanographic Institution', 2),
 ('Medical C. Wisconsin', 2),
 ('U. Texas Medical Branch', 2),
 ('U. Texas Health Science Center, San Antonio', 2),
 ('U. Nebraska, Medical Center', 2),
 ('U. Arkansas for Medical Sciences', 2),
 ('Thomas Jefferson U.', 2),
 ('Cold Spring Harbor Laboratory', 2),
 ('Rush U.', 2),
 ('Georgia Regents U.\t\t\t\t\t\t\t\t\t', 2),
 ('U. Tennessee, Knoxville, Institute of Agriculture', 1),
 ('U. Tennessee, Health Science Center', 2),
 ('U. Maryland, Center for Environmental Science', 1),
 ('Louisiana State U., Health Sciences Center \xe2\x80\x93 New Orleans', 1),
 ('U. North Texas, Health Science Center', 1),
 ('U. Puerto Rico, Medical Sciences Campus', 1),
 ('Eastern Virginia Medical School', 1),
 ('Van Andel Institute', 1),
 ('Texas Tech U., Health Sciences Center', 1),
 ('Morehouse School of Medicine', 1),
 ('Mercer U.', 1),
 ('SUNY, Upstate Medical U.', 1),
 ('SUNY, Downstate Medical Center', 1),
 ('Loma Linda U.', 1),
 ('Catholic U. of America', 1),
 ('Louisiana State U., Health Sciences Center - Shreveport', 1),
 ('Texas A&M U.-Kingsville', 1),
 ('U. of the Virgin Islands', 1),
 ('Texas Tech U., Health Sciences Center, El Paso', 1),
 ('Albany Medical C.', 1),
 ('New York Medical C.', 1),
 ('Northeast Ohio Medical U.', 1),
 ('Rosalind Franklin U. of Medicine and Science', 1),
 ('Charles R. Drew U. of Medicine and Science', 1),
 ('Meharry Medical C.', 1),
 ('North Carolina Central U.', 1),
 ('U. Texas Health Science Center, Tyler', 1),
 ('Montana Tech of U. Montana', 1),
 ('Humboldt State U.', 1),
 ('Rhode Island C.', 1),
 ('Tarleton State U.', 1),
 ('U. Maryland, Eastern Shore', 1),
 ("St. Edward's U.", 1),
 ('Dillard U.', 1),
 ('Midwestern U.', 1),
 ('Langston U.', 1),
 ('Ponce Health Sciences U.', 1),
 ('U. New England', 1),
 ('U. South Florida, Saint Petersburg', 1),
 ('Seton Hall U.', 1),
 ('Clark U.', 1),
 ('U. Central del Caribe', 1),
 ('U. Guam', 1),
 ('National Defense U.', 1),
 ('Alfred U.', 1),
 ('U. Massachusetts, central office', 1),
 ('California State U., Los Angeles', 1),
 ('Western U. of Health Sciences', 1),
 ('Southern U. and A&M C., Agricultural Research and Extension Center', 1),
 ('A. T. Still U.', 1),
 ('Milwaukee School of Engineering', 1),
 ('U. Oklahoma, Tulsa', 1),
 ('U. of the Sciences Philadelphia', 1),
 ('MGH Institute of Health Professions', 1),
 ('SUNY, C. of Optometry', 1),
 ('Roseman U. of Health Sciences', 1),
 ('Mills C.', 1),
 ('Touro U., Vallejo', 1),
 ('Memorial Sloan Kettering Cancer Center, Louis V. Gerstner Jr. Graduate S. of Biomedical Sciences',
  1),
 ('Fuller Theological Seminary', 1),
 ('Eastern Washington U.', 1),
 ('Plymouth State U.', 1),
 ('Tougaloo C.', 1),
 ('Southeastern Louisiana U.', 1),
 ('Naval War C.', 1),
 ('Central Washington U.', 1),
 ('Philadelphia C. of Osteopathic Medicine', 1),
 ('Northwest Indian C.', 1),
 ('Black Hills State U.', 1),
 ('Erikson Institute', 1),
 ('SUNY, Buffalo State', 1),
 ('National U.', 1),
 ('Alabama State U.', 1),
 ('Edward Via C. of Osteopathic Medicine', 1),
 ('Pittsburg State U.', 1),
 ('Sul Ross State U.', 1),
 ('Commonwealth Medical C.', 1),
 ('Franklin W. Olin C. of Engineering', 1),
 ('Austin Peay State U.', 1),
 ('U. Illinois, Springfield', 1),
 ('Oregon Institute of Technology', 1),
 ('Mercyhurst U.', 1),
 ('Connecticut C.', 1),
 ('Grambling State U.', 1),
 ('Morehead State U.', 1),
 ('U. New Haven', 1),
 ('U. Texas, Permian Basin', 1),
 ('Keck Graduate Institute', 1),
 ('Wheeling Jesuit U.', 1),
 ('Palmer C. of Chiropractic, Davenport', 1),
 ('Eastern Kentucky U.', 1),
 ('U. Louisiana, Monroe', 1),
 ('Oklahoma State U., Center for Health Sciences', 1),
 ('U. South Florida, Sarasota-Manatee', 1),
 ('Nicholls State U.', 1),
 ('U. Alaska, Southeast', 1),
 ('Occidental C.', 1),
 ('California Maritime Academy', 1),
 ('Indiana State U.', 1),
 ('U. Puerto Rico, Cayey', 1),
 ('Maine Maritime Academy', 1),
 ('Marshall B. Ketchum U.', 1),
 ('Stockton U.', 1),
 ('U. Tampa', 1),
 ('Bastyr U.', 1),
 ('Albany State U.', 1),
 ('American Samoa Community C.', 1),
 ('Providence C.', 1),
 ('Emerson C.', 1),
 ('Salus U.', 1),
 ('Alaska Pacific U.', 1),
 ('Christopher Newport U.', 1),
 ('Augsburg C.', 1),
 ('Salish Kootenai C.', 1),
 ('U. del Este', 1),
 ('La Sierra U.', 1),
 ('SUNY, C. Plattsburgh', 1),
 ('Heidelberg U.', 1),
 ('SUNY, Farmingdale State C.', 1),
 ('Seattle Pacific U.', 1),
 ('California State U., Stanislaus', 1),
 ('New England C. of Optometry', 1),
 ('U. Western States', 1),
 ('U. Houston system administration', 1),
 ('Keene State C.', 1),
 ('Hobart and William Smith Colleges', 1),
 ('U. of the Incarnate Word', 1),
 ('U. Puerto Rico, Humacao', 1),
 ('Augustana C., Sioux Falls', 1),
 ('Barry U.', 1),
 ('CUNY, Advanced Science Research Center', 1),
 ('U. Redlands', 1),
 ('Doane C.', 1),
 ('Florida Polytechnic U.', 1)]
In [43]:
clust_zip[0][1]
Out[43]:
6
In [59]:
# Group institution names by cluster ID.
# The cluster IDs come from the data rather than a hard-coded range(1, 7), so this keeps
# working if the distance cut-off (and therefore the number of clusters) changes.
cluster_dict = {cluster_id: []
                for cluster_id in sorted({int(cid) for _, cid in clust_zip})}
# Single pass over the (institution, cluster ID) pairs; row order is kept within each list.
for institution, cid in clust_zip:
    cluster_dict[int(cid)].append(institution)
cluster_dict
Out[59]:
{1: ['U. Alabama, Huntsville',
  'George Mason U.',
  'U. Alabama, Tuscaloosa',
  'Northeastern U.',
  'U. Louisiana, Lafayette',
  'U. Texas, Dallas',
  'U. Maryland, Baltimore County',
  'Naval Postgraduate School',
  'Air Force Institute of Technology',
  'SUNY, Binghamton U.',
  'U. Texas, El Paso',
  'San Diego State U.',
  'U. Texas, Arlington',
  'Worcester Polytechnic Institute',
  'U. North Texas, Denton',
  'U. North Carolina, Charlotte',
  'DePaul U.',
  'Wright State U.',
  'Syracuse U.',
  'U. Oregon',
  'U. Texas, San Antonio',
  'Rochester Institute of Technology',
  'U. Massachusetts, Lowell',
  'U.S. Air Force Academy',
  'Louisiana Tech U.',
  'Michigan Technological U.',
  'Indiana U.-Purdue U., Indianapolis',
  'U. Tulsa',
  'U. Memphis',
  'Brandeis U.',
  'Georgia State U.',
  'Boise State U.',
  'Illinois Institute of Technology',
  'U. Massachusetts, Boston',
  'North Carolina Agricultural and Technical State U.',
  'Brigham Young U., Provo',
  'U. Nebraska, Omaha',
  'Toyota Technological Institute, Chicago',
  'Kent State U.',
  'Stevens Institute of Technology',
  'Texas State U.',
  'California State U., San Bernardino',
  'Portland State U.',
  'C. of William and Mary and Virginia Institute of Marine Science',
  'CUNY, City C.',
  'Missouri U. of Science and Technology',
  'Old Dominion U.',
  'Jackson State U.',
  'Southern Methodist U.',
  'U. Nevada, Reno',
  'U. Alaska, Fairbanks',
  'San Francisco State U.',
  'U. Wisconsin-Milwaukee',
  'U. New Hampshire',
  'U. Missouri, Kansas City',
  'U. Wyoming',
  'U.S. Naval Academy',
  'Texas A&M U.-Corpus Christi',
  'U. North Carolina, general administration',
  'U. Arkansas, Little Rock',
  'Harvey Mudd C.',
  'U. California, Office of the President',
  'U. Colorado Colorado Springs',
  'Lehigh U.',
  'U.S. Military Academy',
  'U. New Orleans',
  'American U.',
  'Delaware State U.',
  'Montana State U., Bozeman',
  'Baylor U.',
  'Villanova U.',
  'Bryn Mawr C.',
  'U. Massachusetts, Dartmouth',
  'Alabama A&M U.',
  'Northern Arizona U.',
  'CUNY, Queens C.',
  'Florida Institute of Technology',
  'U. South Alabama',
  'U. Nevada, Las Vegas',
  'Gallaudet U.',
  'Bowie State U.',
  'Marquette U.',
  'U. Tennessee, Chattanooga',
  'California Polytechnic State U., San Luis Obispo',
  'Boston C.',
  'Tennessee Technological U.',
  'Tuskegee U.',
  'Desert Research Institute',
  'Clarkson U.',
  'Ball State U.',
  'Florida Atlantic U.',
  'Creighton U.',
  'Rutgers, State U. New Jersey, Newark',
  'U. Maine',
  'Howard U.',
  'U. Puerto Rico, Mayaguez',
  'California State U., Northridge',
  'Dakota State U.',
  'Elizabeth City State U.',
  'U. California, Merced',
  'Southern Illinois U., Carbondale',
  'Illinois State U.',
  'Norfolk State U.',
  'CUNY, system office',
  'Smith C.',
  'Western Washington U.',
  'U. Puerto Rico, Rio Piedras',
  'Williams C.',
  'South Dakota State U.',
  'Texas Southern U.',
  'Western Michigan U. and Homer Stryker M.D. School of Medicine',
  'Morgan State U.',
  'U. Montana, Missoula',
  'U. Akron',
  'Hampton U.',
  'Florida A&M U.',
  'Miami U.',
  'California State U., Bakersfield',
  'U. South Dakota',
  'U. Texas, Brownsville',
  'Sam Houston State U.',
  'C. Charleston',
  'CUNY, Hunter C.',
  'U. Texas Pan American',
  'Oakland U.',
  'Stephen F. Austin State U.',
  'Loyola U., Chicago',
  'Willamette U.',
  'Northern Illinois U.',
  'Wellesley C.',
  'Fordham U.',
  'Towson U.',
  'U. Central Arkansas',
  'U. Minnesota, Duluth',
  'Calvin C.',
  'California State U., San Marcos',
  'U. Houston-Downtown',
  'Tennessee State U.',
  'Pennsylvania State U., Harrisburg',
  'Pace U.',
  'U. Rhode Island',
  'Columbia U., Teachers C.',
  'California State U., Monterey Bay',
  'U. Southern Mississippi',
  'U. Denver',
  'Trinity C., Hartford',
  'Colorado School of Mines',
  'U. Metropolitana',
  'Idaho State U.',
  'U. Southern Maine',
  'U. Northern Colorado',
  'CUNY, John Jay C. of Criminal Justice',
  'Montclair State U.',
  'Prairie View A&M U.',
  'Pomona C.',
  'Carleton C.',
  'Duquesne U.',
  'California State U., Fresno',
  'Vassar C.',
  'Xavier U. Louisiana',
  'St. Olaf C.',
  'Cleveland State U.',
  'U. Arkansas, Pine Bluff',
  'Marist C.',
  'San Jose State U.',
  'CUNY, Brooklyn C.',
  'California State U., Sacramento',
  'U. North Carolina, Wilmington',
  'Fayetteville State U.',
  'New School',
  'Fairfield U.',
  'Mount Holyoke C.',
  'Alcorn State U.',
  'Wesleyan U.',
  'Lewis and Clark C.',
  'Rutgers, State U. New Jersey, Camden',
  'U. North Carolina, Greensboro',
  'Arkansas State U., Jonesboro',
  'Saint Louis U.',
  'U. North Florida',
  'Central Connecticut State U.',
  'Central Michigan U.',
  'CUNY, C. Staten Island',
  'Trinity U.',
  'Embry-Riddle Aeronautical U.',
  'Southern Illinois U., Edwardsville',
  'U. Washington, Bothell',
  'Southern U. and A&M C., Baton Rouge',
  'Amherst C.',
  'California State U., Channel Islands',
  'U. North Dakota',
  'Lafayette C.',
  'Purdue U., Calumet',
  'Loyola Marymount U.',
  'Clark Atlanta U.',
  'Spelman C.',
  'CUNY, Graduate Center',
  'Kean U.',
  'Southern Connecticut State U.',
  'Texas A&M International U.',
  'New Mexico Institute of Mining and Technology',
  'James Madison U.',
  'Virginia State U.',
  'U. Missouri, Saint Louis',
  'Salisbury U.',
  'Colgate U.',
  'Ohio U.',
  'West Chester U. Pennsylvania',
  'U. Houston-Clear Lake',
  'Texas A&M U.-Commerce',
  'Appalachian State U.',
  'U. Washington, Tacoma',
  'Pennsylvania State U., Behrend',
  'Georgia Southern U.',
  'U. Hawaii, Hilo',
  'East Tennessee State U.',
  'CUNY, Lehman C.',
  'Lamar U.',
  'Reed C.',
  'U. Wisconsin-Stevens Point',
  'New York Institute of Technology',
  'Bowdoin C.',
  'Barnard C.',
  'Claremont Graduate U.',
  'Macalester C.',
  'Bowling Green State U.',
  'Seattle U.',
  'Oberlin C.',
  'U. Central Oklahoma',
  'West Virginia State U.',
  'Kennesaw State U.',
  'Elon U.',
  'U. South Carolina, Aiken',
  'Wichita State U.',
  'Azusa Pacific U.',
  'Bates C.',
  'SUNY, C. of Environmental Science and Forestry',
  'West Texas A&M U.',
  'Benedict C.',
  'Morehouse C.',
  'U. Detroit Mercy',
  'Middle Tennessee State U.',
  'Valparaiso U.',
  'Grinnell C.',
  'Swarthmore C.',
  'Winthrop U.',
  'Grand Valley State U.',
  'C. of Saint Benedict',
  'California State U., Dominguez Hills',
  'Texas Christian U.',
  'C. Wooster',
  'U. West Florida',
  'Bradley U.',
  'Rowan U.',
  'Norwich U.',
  'U. Hartford',
  'La Salle U.',
  'U. del Turabo',
  'Siena C.',
  'Lincoln U., Jefferson City',
  "Saint John's U., Collegeville",
  'Bucknell U.',
  'Shaw U.',
  'Sonoma State U.',
  'Indiana U.-Purdue U., Fort Wayne',
  'U. Wisconsin-La Crosse',
  'U. San Francisco',
  'CUNY, Baruch C.',
  'U. Wisconsin-Oshkosh',
  'Kettering U.',
  'California State U., Long Beach',
  'Middlebury C.',
  'U. South Carolina, Beaufort',
  'Davidson C.',
  'Minnesota State U., Mankato',
  'Wiley C.',
  'East Central U.',
  'U. Baltimore',
  'Ithaca C.',
  'South Dakota School of Mines and Technology',
  'Western Kentucky U.',
  'South Carolina State U.',
  'Lawrence Technological U.',
  'Eastern Michigan U.',
  'Union C., Schenectady',
  "Saint Michael's C.",
  'U. Nebraska, Kearney',
  'U. Alaska, Anchorage',
  'Colorado C.',
  'U. West Georgia',
  'Florida Gulf Coast U.',
  'St. Cloud State U.',
  'U. Toledo',
  'Fort Valley State U.',
  "St. John's U., Manhattan",
  'Haverford C.',
  'U. Wisconsin-Green Bay',
  'Fisk U.',
  'U. of the Pacific',
  'U. Minnesota, Morris',
  'Missouri State U.',
  'Pepperdine U.',
  'CUNY, Medgar Evers C.',
  'Quinnipiac U.',
  'Hamilton C.',
  'East Carolina U.',
  'Hofstra U.',
  'U. Texas, Tyler',
  'Furman U.',
  'Colby C.',
  'SUNY, Geneseo',
  'Gonzaga U.',
  'U. Wisconsin-Platteville',
  'Hope C.',
  'California State U., Chico',
  'Claflin U.',
  'CUNY, York C.',
  'Suffolk U.',
  'Kentucky State U.',
  'California State U., Fullerton',
  'Skidmore C.',
  'Western Illinois U.',
  'Murray State U.',
  'Northern Kentucky U.',
  'McNeese State U.',
  'U. San Diego',
  'Savannah State U.',
  'Rider U.',
  'California State Polytechnic U., Pomona',
  'Indiana U., South Bend',
  'U. Richmond',
  'Eastern Connecticut State U.',
  'U. of the District of Columbia',
  'Marshall U.',
  'U. Northern Iowa',
  'Niagara U.',
  'Nova Southeastern U.',
  'Rhode Island School of Design',
  'U. of Mary Washington',
  'Chapman U.',
  'U. Michigan, Dearborn',
  'Roger Williams U.',
  'Hawaii Pacific U.',
  'Jacksonville State U.',
  "Texas Woman's U.",
  'Purdue U., North Central',
  'Central State U.',
  'Albany C. of Pharmacy and Health Sciences',
  'Franklin and Marshall C.',
  'Pacific U.',
  'Washington and Lee U.',
  'Saginaw Valley State U.',
  'Western Carolina U.',
  'Dickinson C.',
  "Saint Joseph's U.",
  'Coastal Carolina U.',
  'Wheaton C., Wheaton',
  'U. North Carolina, Asheville',
  'Youngstown State U.',
  'SUNY, C. Brockport',
  'Sewanee: U. of the South',
  'Santa Clara U.',
  'Lake Superior State U.',
  'U. Tennessee, Knoxville, Institute of Agriculture',
  'U. Maryland, Center for Environmental Science',
  'Louisiana State U., Health Sciences Center \xe2\x80\x93 New Orleans',
  'U. North Texas, Health Science Center',
  'U. Puerto Rico, Medical Sciences Campus',
  'Eastern Virginia Medical School',
  'Van Andel Institute',
  'Texas Tech U., Health Sciences Center',
  'Morehouse School of Medicine',
  'Mercer U.',
  'SUNY, Upstate Medical U.',
  'SUNY, Downstate Medical Center',
  'Loma Linda U.',
  'Catholic U. of America',
  'Louisiana State U., Health Sciences Center - Shreveport',
  'Texas A&M U.-Kingsville',
  'U. of the Virgin Islands',
  'Texas Tech U., Health Sciences Center, El Paso',
  'Albany Medical C.',
  'New York Medical C.',
  'Northeast Ohio Medical U.',
  'Rosalind Franklin U. of Medicine and Science',
  'Charles R. Drew U. of Medicine and Science',
  'Meharry Medical C.',
  'North Carolina Central U.',
  'U. Texas Health Science Center, Tyler',
  'Montana Tech of U. Montana',
  'Humboldt State U.',
  'Rhode Island C.',
  'Tarleton State U.',
  'U. Maryland, Eastern Shore',
  "St. Edward's U.",
  'Dillard U.',
  'Midwestern U.',
  'Langston U.',
  'Ponce Health Sciences U.',
  'U. New England',
  'U. South Florida, Saint Petersburg',
  'Seton Hall U.',
  'Clark U.',
  'U. Central del Caribe',
  'U. Guam',
  'National Defense U.',
  'Alfred U.',
  'U. Massachusetts, central office',
  'California State U., Los Angeles',
  'Western U. of Health Sciences',
  'Southern U. and A&M C., Agricultural Research and Extension Center',
  'A. T. Still U.',
  'Milwaukee School of Engineering',
  'U. Oklahoma, Tulsa',
  'U. of the Sciences Philadelphia',
  'MGH Institute of Health Professions',
  'SUNY, C. of Optometry',
  'Roseman U. of Health Sciences',
  'Mills C.',
  'Touro U., Vallejo',
  'Memorial Sloan Kettering Cancer Center, Louis V. Gerstner Jr. Graduate S. of Biomedical Sciences',
  'Fuller Theological Seminary',
  'Eastern Washington U.',
  'Plymouth State U.',
  'Tougaloo C.',
  'Southeastern Louisiana U.',
  'Naval War C.',
  'Central Washington U.',
  'Philadelphia C. of Osteopathic Medicine',
  'Northwest Indian C.',
  'Black Hills State U.',
  'Erikson Institute',
  'SUNY, Buffalo State',
  'National U.',
  'Alabama State U.',
  'Edward Via C. of Osteopathic Medicine',
  'Pittsburg State U.',
  'Sul Ross State U.',
  'Commonwealth Medical C.',
  'Franklin W. Olin C. of Engineering',
  'Austin Peay State U.',
  'U. Illinois, Springfield',
  'Oregon Institute of Technology',
  'Mercyhurst U.',
  'Connecticut C.',
  'Grambling State U.',
  'Morehead State U.',
  'U. New Haven',
  'U. Texas, Permian Basin',
  'Keck Graduate Institute',
  'Wheeling Jesuit U.',
  'Palmer C. of Chiropractic, Davenport',
  'Eastern Kentucky U.',
  'U. Louisiana, Monroe',
  'Oklahoma State U., Center for Health Sciences',
  'U. South Florida, Sarasota-Manatee',
  'Nicholls State U.',
  'U. Alaska, Southeast',
  'Occidental C.',
  'California Maritime Academy',
  'Indiana State U.',
  'U. Puerto Rico, Cayey',
  'Maine Maritime Academy',
  'Marshall B. Ketchum U.',
  'Stockton U.',
  'U. Tampa',
  'Bastyr U.',
  'Albany State U.',
  'American Samoa Community C.',
  'Providence C.',
  'Emerson C.',
  'Salus U.',
  'Alaska Pacific U.',
  'Christopher Newport U.',
  'Augsburg C.',
  'Salish Kootenai C.',
  'U. del Este',
  'La Sierra U.',
  'SUNY, C. Plattsburgh',
  'Heidelberg U.',
  'SUNY, Farmingdale State C.',
  'Seattle Pacific U.',
  'California State U., Stanislaus',
  'New England C. of Optometry',
  'U. Western States',
  'U. Houston system administration',
  'Keene State C.',
  'Hobart and William Smith Colleges',
  'U. of the Incarnate Word',
  'U. Puerto Rico, Humacao',
  'Augustana C., Sioux Falls',
  'Barry U.',
  'CUNY, Advanced Science Research Center',
  'U. Redlands',
  'Doane C.',
  'Florida Polytechnic U.'],
 2: ['Carnegie Mellon U.',
  'U. Texas, Austin',
  'U. Maryland, College Park',
  'Brown U.',
  'U. Central Florida',
  'U. Tennessee, Knoxville',
  'U. Hawaii, Manoa',
  'U. Massachusetts, Amherst',
  'Arizona State U.',
  'Princeton U.',
  'Iowa State U.',
  'SUNY, Stony Brook U.',
  'U. California, Irvine',
  'Rice U.',
  'Florida International U.',
  'U. Nebraska, Lincoln',
  'Rensselaer Polytechnic Institute',
  'U. Notre Dame',
  'U. California, Santa Barbara',
  'Mississippi State U.',
  'Florida State U.',
  'New Jersey Institute of Technology',
  'California Institute of Technology',
  'U. Colorado Boulder',
  'Oregon State U.',
  'Clemson U.',
  'U. Houston',
  'George Washington U.',
  'SUNY, U. Albany',
  'Texas Tech U.',
  'Colorado State U., Fort Collins',
  'U. Delaware',
  'Washington State U.',
  'Drexel U.',
  'Kansas State U.',
  'Tufts U.',
  'U. New Mexico',
  'Dartmouth C.',
  'U. California, Riverside',
  'Louisiana State U., Baton Rouge',
  'U. Connecticut',
  'Temple U.',
  'North Dakota State U.',
  'U. Missouri, Columbia',
  'Oklahoma State U., Stillwater',
  'Wayne State U.',
  'U. California, Santa Cruz',
  'U. South Carolina, Columbia',
  'U. Kansas',
  'Utah State U.',
  'U. Idaho',
  'Georgetown U.',
  'U. Dayton',
  'New Mexico State U.',
  'Tulane U.',
  'U. Oklahoma, Norman and Health Science Center',
  'Virginia Commonwealth U.',
  'West Virginia U.',
  'U. Vermont',
  'U. Louisville',
  'U. Mississippi',
  'Auburn U., Auburn',
  'Wake Forest U.',
  'U. Arkansas, Fayetteville',
  'Woods Hole Oceanographic Institution',
  'Medical C. Wisconsin',
  'U. Texas Medical Branch',
  'U. Texas Health Science Center, San Antonio',
  'U. Nebraska, Medical Center',
  'U. Arkansas for Medical Sciences',
  'Thomas Jefferson U.',
  'Cold Spring Harbor Laboratory',
  'Rush U.',
  'Georgia Regents U.\t\t\t\t\t\t\t\t\t',
  'U. Tennessee, Health Science Center'],
 3: ['U. Southern California',
  'U. Utah',
  'Indiana U., Bloomington',
  'U. Chicago',
  'Rutgers, State U. New Jersey, New Brunswick',
  'New York U.',
  'U. Illinois, Chicago',
  'SUNY, U. Buffalo',
  'U. Georgia',
  'Rockefeller U.',
  'U. South Florida, Tampa',
  'Boston U.',
  'U. Virginia, Charlottesville',
  'U. Rochester',
  'U. Kentucky',
  'U. Iowa',
  'U. Miami',
  'U. Cincinnati',
  'Case Western Reserve U.',
  'U. Colorado Denver and Anschutz Medical Campus',
  'U. Maryland, Baltimore',
  'Scripps Research Institute',
  'Uniformed Services U. of the Health Sciences',
  'Oregon Health and Science U.',
  'Yeshiva U.',
  'U. Massachusetts, Medical School',
  'Medical U. South Carolina',
  'U. Texas Health Science Center, Houston'],
 4: ['U. Illinois, Urbana-Champaign',
  'Georgia Institute of Technology',
  'Massachusetts Institute of Technology',
  'Pennsylvania State U., University Park and Hershey Medical Center',
  'North Carolina State U.',
  'Virginia Polytechnic Institute and State U.',
  'Purdue U., West Lafayette',
  'Texas A&M U., College Station and Health Science Center',
  'Michigan State U.',
  'U. California, Berkeley',
  'U. Arizona',
  'SUNY, Polytechnic Institute'],
 5: ['U. California, San Diego',
  'U. Minnesota, Twin Cities',
  'U. Wisconsin-Madison',
  'Ohio State U.',
  'Stanford U.',
  'U. California, Los Angeles',
  'Columbia U. in the City of New York',
  'U. Michigan, Ann Arbor',
  'U. North Carolina, Chapel Hill',
  'U. Washington, Seattle',
  'Cornell U.',
  'Duke U.',
  'U. Texas M. D. Anderson Cancer Center',
  'U. Pennsylvania',
  'Harvard U.',
  'U. California, Davis',
  'Yale U.',
  'U. Florida',
  'U. Pittsburgh, Pittsburgh',
  'Washington U., Saint Louis',
  'Northwestern U.',
  'Emory U.',
  'Vanderbilt U.',
  'U. Alabama, Birmingham',
  'U. California, San Francisco',
  'Baylor C. of Medicine',
  'Icahn School of Medicine at Mt. Sinai',
  'U. Texas Southwestern Medical Center'],
 6: ['Johns Hopkins U.']}
In [72]:
for i in cluster_dict[6]:
    print str(i)+";",
Johns Hopkins U.;
In [67]:
for i in cluster_dict:
    print i, len(cluster_dict[i])
 1 496
2 75
3 28
4 12
5 28
6 1