Notebook

Step 1: Load and plot the data

In [21]:

import operator

# read data; the with-block closes the file once all lines are in memory
with open( "imdb.tsv", "r" ) as txt:
    lines = txt.readlines()

# remove only the line terminators: a full strip() on the line would also eat
# a leading or trailing tab, i.e. silently drop an empty first or last column
lines = [ line.rstrip( "\r\n" ) for line in lines ]

# skip blank lines (e.g. an empty line at the end of the file), which would
# otherwise become one-element rows and break the column lookups below
lines = [ line for line in lines if line.strip() ]

# break each line into tab-separated fields, trimming whitespace per field so
# that the sort key below and the column lists see the same cleaned values
tuples = [ [ field.strip() for field in line.split( "\t" ) ] for line in lines ]

# remove the column names from the results
attributes = tuples.pop( 0 )

# sort the rows by country name (column 4); sorted() is stable, so rows of the
# same country keep their order from the file
tuples = sorted(tuples, key=operator.itemgetter(4))

# make separate lists for each column of the txt file
# (the loop variable is deliberately not called `tuple`, to avoid shadowing the builtin)
ranks     = [ int(row[0]) for row in tuples ]
titles    = [ row[1]      for row in tuples ]
years     = [ int(row[2]) for row in tuples ]
directors = [ row[3]      for row in tuples ]
countries = [ row[4]      for row in tuples ]

# set the figure size before the first plotting call creates the figure
figsize(6,6)

title("Best 30 Movies of All Time")
xlabel("Year of Release")
ylabel("Ranking (lower is better)")
scatter(years, ranks, c="blue", s=100, marker="o");

Step 2: Plot the data by country

In [36]:

# pick the country names without duplicates, in alphabetical order
unique_countries = sorted(set(countries))

# match ranking position to dot size
# NOTE(review): these values are not used by the plot in this cell (every dot
# is drawn with s = 100); they are kept because other cells may read them
min_dot_size   = 100
max_dot_size   = 500
dot_size_range = max_dot_size - min_dot_size
min_rank       = 1
max_rank       = max(ranks)

# set the figure size BEFORE any plotting call: title()/xlabel()/... create the
# figure, and a size set afterwards only affects figures created later
figsize(10,6)

# define plot properties
title("Best 30 Movies of All Time by Country")
xlabel("Year of Release")
ylabel("Ranking (lower is better)")
grid(True, which="both")
xlim((1920, 2040))

# look up the colormap; newer matplotlib exposes the registry as
# matplotlib.colormaps and no longer provides matplotlib.cm.get_cmap
if hasattr( matplotlib, "colormaps" ):
    color_map = matplotlib.colormaps["Paired"]
else:
    color_map = matplotlib.cm.get_cmap("Paired")

# plot dots for every country separately so each gets its own colour and legend entry
for country_to_plot_index, country_to_plot in enumerate( unique_countries ):

    years_to_plot = [ year for year, country in zip( years, countries ) if country == country_to_plot ]
    ranks_to_plot = [ rank for rank, country in zip( ranks, countries ) if country == country_to_plot ]

    # one RGBA colour per country, spread evenly over the colormap
    dot_color = color_map( float( country_to_plot_index ) / len( unique_countries ) )

    # pass the colour as a one-row list: a bare RGBA tuple is ambiguous when a
    # country has exactly 4 films (it would be read as 4 values to colour-map)
    scatter(years_to_plot,
            ranks_to_plot,
            c = [ dot_color ],
            s = 100,
            marker = "o",
            label = country_to_plot );

legend(loc="lower right", ncol=1);