#!/usr/bin/env python
# coding: utf-8

# [Oregon Curriculum Network](http://4dsolutions.net/ocn/)
# [Home](School_of_Tomorrow.ipynb)
# 
# # Data Visualization (Part One)
# # Globe & Map
# 
# Cleveland High School, Portland, Oregon
# ## Introduction to Data Science
# 
# In entering the realm of Data Science, we come upon a world concerned with predicting the future, anticipating what's next, based on extrapolation and sometimes interpolation. Many data science practices inherit from the insurance industry, which is about assessing and socializing (spreading the costs of) risk.
# 
# We predict about the past as well. We're often keen to know of events that may have already taken place.
# 
# ### Andragogy / Pedagogy
# 
# Statisticians talk a lot about sampling a population, where the latter is what we wish to accurately characterize, but we haven't the means to survey all the data. Many algorithms distinguish between computing over the entire population and computing over samples drawn from it.

# In[1]:


def pascal(rows):
    """Generate successive rows of Pascal's Triangle."""
    row = [1]
    for _ in range(rows):
        row = [i + j for i, j in zip(row + [0], [0] + row)]
        yield row

for r in pascal(20):  # r is left holding the 20th row when the loop ends
    pass


# In[2]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

plt.bar(range(len(r)), r);  # bar heights: the binomial coefficients of row 20


# At the School of Tomorrow, we recommend immersion, as when learning a language, to pick up on correlation, regression, normal distribution, confidence intervals and so on. Absorb the semantics and make connections to some glue language like Python for specific workouts.
# 
# The concept of a vector is especially important, given its embodiment as an almost literal tip-to-tail arrow pointing from the origin to anywhere in an n-D space. Such pointing, with corresponding labeling, is the bread and butter input of supervised machine learning algorithms.
# 
# Up to 3-D we have the visualizable space of polyhedrons.
# 
# ### Historical Sidebar
# 
# In coordinated Martian Math segments, on polyhedrons, the School of Tomorrow may introduce quadrays, as a questioning and investigational tool à la Ludwig Wittgenstein. How many basis vectors do we need again? The famous three need their three opposites, rotated 180 degrees. "What minimum basis might get by without needing opposites?"
# 
# We guess about this and that, whether this or that happened in the past, or has yet to happen. When making these guesses, we use existing data as evidence. A model that's scoring well is able to correctly predict what we already know to be the case.
# 
# ### The Science of Predicting
# 
# Under the heading of "prediction," therefore, comes "the ability to guess correctly," whether we're looking into the future or into the past. Keep in mind that Physics, including Quantum Physics, is just as interested in prediction, in "guessing with some confidence," as any discipline.
# 
# A goal, in engineering, is to have some influence over outcomes, and that means looking for trimtabs.
# 
# How might we optimize various distribution networks, such as the internet itself, so that they're less likely to bog down in traffic jams?
# 
# ### Historical Sidebar
# 
# "Data Science" is a relatively recent name for what used to be called Statistics. We still have Statistics, but ever since statistics joined forces with Machine Learning, the term "data science" has been in the foreground. The evolution of Machine Learning has taken place against the backdrop of some professional debates among statisticians. One of these debates has been between so-called "Frequentists" and another camp known as "Bayesians."
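# To make Bayes' Rule concrete, here is a minimal worked example, not from the original notebook, with purely illustrative (made-up) numbers: a screening test that is 99% sensitive and 95% specific, for a condition with 1% prevalence.

# In[ ]:


# Bayes' Rule:  P(H|E) = P(E|H) * P(H) / P(E)
# All numbers below are hypothetical, chosen only to exercise the formula.
p_h = 0.01              # prior: P(condition)
p_e_given_h = 0.99      # sensitivity: P(positive | condition)
p_e_given_not_h = 0.05  # false-positive rate: P(positive | no condition)

# total probability of a positive test, across both hypotheses
p_e = p_e_given_h * p_h + p_e_given_not_h * (1 - p_h)

posterior = p_e_given_h * p_h / p_e
print(f"P(condition | positive test) = {posterior:.3f}")  # about 0.167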
# ### Research Project: Recent History of Data Science
# 
# Looking for a research topic? Here's [a place to start](https://www.amazon.com/dp/B0050QB3EQ): *The Theory That Would Not Die: How Bayes' Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy* by Sharon Bertsch McGrayne.

# In[3]:


import json

with open("glossary.json", 'r') as infile:  # context manager syntax
    glossary = json.load(infile)

glossary['HTTP']


# ### Programming Interlude: Context Managers
# 
# Since we've chosen Python for a kernel language (many choices exist), we might as well dig into it from time to time. In the code cell above, you'll notice the keyword ```with``` with the optional ```as``` piece, with indented code underneath (as many lines as we like).
# 
# The indented code is the body of our "context," which is entered at the top and exited at the bottom. The occasions of entering and exiting a context automatically trigger the ```__enter__``` and ```__exit__``` methods of the object we're using with ```with```.

# In[4]:


class Castle:
    """
    Example of a class designed to perform as
    a context manager, as triggered by keyword 'with'
    """

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        return self  # pass forward through as

    def inner_sanctum(self):  # Monty Python allusion
        return "Holy Grail from %s" % self.name

    def __exit__(self, *oops):
        if oops[0]:
            # do cleanup; oops holds (exc_type, exc_value, traceback)
            pass
        return True  # True tells Python any exception has been handled


with Castle("Goth Castle") as castle:
    knight_bag = castle.inner_sanctum()

print("Content of knight_bag:", knight_bag)


# In[5]:


glossary["Bayesian"] = "inferential methods usable even in the absence of any prospect for controlled studies"
glossary["Pharo"] = "a Smalltalk-like language and ecosystem that competes with Python's"
glossary["Sphinx"] = "a documentation generator, targeting the web in particular, for use with Python"


# #### Digital Mathematics curriculum
# 
# In [Digital Mathematics: Heuristics for Teachers](http://wikieducator.org/Digital_Math) you will find a way of carving up our mathematical domain into four sections:
# 
# * Martian Math (looking towards the future)
# * Neolithic Math (looking towards the past)
# * Casino Math (looking at risks)
# * Supermarket Math (looking at ecological systems)
# 
# Ready for [Part Two](dataviz2.ipynb)?

# ## More Tools...
# 
# In addition to Python the language, which is our Kernel, we need to use some add-on 3rd party packages, which usually get installed in a subfolder called ```site-packages``` associated with the specific Python you're using.
# 
# Four of these packages are:
# 
# * ```numpy``` (a workhorse that works with tensors, or n-dimensional arrays)
# * ```pandas``` for encapsulating [tensors](https://www.tensorflow.org/guide/tensors) and adding dictionary-like labeling
# * ```matplotlib``` for doing the actual visualizations
# * ```seaborn``` for making matplotlib even prettier
# 
# ### Will I Ever Know Enough?
# 
# What you might be asking yourself, perhaps having glanced at some documentation, is:
# 
# 1. where do I begin? and
# 2. will I really need to memorize hundreds of commands to control each of these products?
# 
# Our assumption here is that you're involved in "world game," meaning thinking globally while acting locally.
# 
# You're on the faculty of a think tank. People look to you for guidance.
# 
# To get a stronger grasp on what's going on, you read a lot, but you also look at data that's sometimes too new to have yet led many, if any, to draw conclusions. You are one of those privileged data analysts with a special vantage point, who will share your sense of what it all means with your peers.
# That's partly why you read, and also write a lot: to keep your communication skills polished. We're learning new languages our entire lives. New vocabularies. New "games" (language games), some of which are literally games. Learning from data also involves applying the techniques of data science, which may include using machine learning algorithms.
# 
# The data you're studying is not necessarily "big data," although it may be. "Small data" may still be quite a lot, by 20th Century standards.
# 
# #### Research Project: Apache Foundation
# 
# [The Apache Foundation](https://www.apache.org/) helps fund a number of valuable free and open source products built to work with big data. In order to gain some fluency with the concepts, do some research on these projects.
# 
# ### What's an API?
# 
# As for memorization, your best bet is to stay in the habit of consulting documentation, and deciphering it. What you're often looking for is advice on how to use an "API" or Application Programming Interface. You might call it a control panel or dashboard, but unless you're operating a GUI, the API is likely encountered in the thick of some programming language, such as Python, Ruby, or JavaScript.
# 
# #### Reading the Docs
# 
# Looking ahead to the next Notebook:
# 
# * How do I sort a DataFrame by index? [Check here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_index.html)
# * How do I sort a DataFrame by any column? [Check here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html)
# 
# If you work with these tools on an everyday basis, you'll become more adept through practice. What's rewarding about programming, though, is that your code runs at machine speed regardless of how long it took you to write, so taking longer than someone else carries no runtime penalty.
# 
# Better to take your time and understand what you're doing than just cut and paste a lot of code you find on the internet. It's fine to cut and paste code, but plan to spend time getting to understand it in some detail. That way, you'll continue along your learning curve.
# 
# A common misapprehension about "learning to code" is that "real programming" always involves starting with a blank canvas and writing everything from scratch. Sometimes that's a good approach. Other times, your best bet is to begin with some existing code and modify it to suit your own purposes, much as piano players usually start from existing scores rather than composing from scratch.
# 
# Without further delay, let's get to know some of our data science tools, each with its own API.

# In[6]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # done above already
import matplotlib as mpl
import seaborn as sns
from math import sin, cos, radians  # let's plot some trig functions!


# Notice that you don't need to import Python itself. That's because Python is the Kernel behind the scenes running all these code cells. One specifies the Kernel upon starting a new Jupyter Notebook.

# In[7]:


# Kernel is Python 3.6 or above
print(f"""\
Numpy version     : {np.__version__}
Pandas version    : {pd.__version__}
Matplotlib version: {mpl.__version__}
Seaborn version   : {sns.__version__}""")


# You probably won't want or need to upgrade each time there's a version change. In fact, sometimes you may find yourself in the opposite situation, of needing to lock in an old version of something. Programmers use containers and virtual environments to preserve old ecosystems and keep them from contaminating each other.
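# One quick way to tell whether the kernel you're running lives inside a virtual environment (an aside, not from the original notebook): since Python 3.3, ```sys.prefix``` differs from ```sys.base_prefix``` inside a venv.

# In[ ]:


import sys

print("prefix      :", sys.prefix)
print("base_prefix :", sys.base_prefix)
print("in a venv?  :", sys.prefix != sys.base_prefix)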
# When you do upgrade a package, you may find that rerunning the same code results in warnings or outright errors. Packages with stable APIs are less likely to surprise you in this way. It's a good idea to consult documentation to find out what's new, if you actually have a choice about whether to upgrade or not.

# In[8]:


# if you have an earlier kernel
print("""\
Numpy version     : {}
Pandas version    : {}
Matplotlib version: {}
Seaborn version   : {}""".format(np.__version__,
                                 pd.__version__,
                                 mpl.__version__,
                                 sns.__version__))


# The code cell below is quite typical of how we might use ```plt``` (matplotlib.pyplot) together with ```np``` (numpy). Note that ```pd``` (pandas) is not yet involved. We'll be seeing it soon.
# 
# The ```np.linspace``` command is one of the most used, as we so often need a particular number of evenly spaced numbers between a minimum and maximum extreme. ```np.arange``` is the other workhorse. It takes a minimum and maximum extreme, just like ```linspace```; however, its third argument is the step size (increment) rather than a count. ```arange``` will figure out how many elements you need, up to but not including the limiting value.
# 
# Note that both of these functions return ```np.ndarray``` objects, where the ```ndarray``` type is the star of ```numpy```. An ```ndarray``` is a multi-dimensional array, meaning it has one or more axes. These axes define the coordinate system structure used to address the contained elements. You'll learn more about the ins and outs of ```ndarrays``` from other notebooks.

# In[9]:


domain = np.linspace(-5, 5, 100)  # give me 100 points from -5 to 5
y_sin = np.sin(domain)            # do all 100
y_cos = np.cos(domain)            # do all 100

def plot_functions():
    plt.figure(figsize=(10, 5))
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.title("Trig Functions")
    lines = plt.plot(domain, y_sin, 'go',
                     domain, y_cos, 'y^')
    # https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
    leg = plt.legend(lines, ("sine", "cosine"),
                     title="Key",
                     frameon=True,
                     shadow=True,
                     facecolor="gray",
                     borderaxespad=2)
    plt.axis([-6, 6, -1.5, 1.5])
    plt.show()

plot_functions()


# ### Historical Sidebar
# 
# Do you live in a dome home? Trigonometric functions prove useful when it comes to computing the vertexes of a geodesic sphere (a small sketch of the idea follows the literary sidebar below).
# 
# One of the best primers on the topic is [Divided Spheres](http://www.dividedspheres.com/) by [Ed Popko](http://www.dividedspheres.com/?page_id=19). Dome homes became popular from the 1960s onward, as an alternative to the more conventional house.
# 
# *Divided Spheres* (book cover)
# 
# The two videos below talk about how we might (or might not) want to envision dome homes going forward.

# In[10]:


from IPython.display import YouTubeVideo
YouTubeVideo("QV4m76Om7bk")  # https://youtu.be/QV4m76Om7bk


# In[11]:


YouTubeVideo("rnkjVd1h8oE")  # https://youtu.be/rnkjVd1h8oE


# The [nbviewer view](https://nbviewer.jupyter.org/github/4dsolutions/School_of_Tomorrow/blob/master/dataviz.ipynb) of this notebook will render the YouTube videos in place; GitHub does not.

# ### Literary Sidebar
# 
# Another author who ventured into the realm of geodesic dome design was [Hugh Kenner](https://en.wikipedia.org/wiki/Hugh_Kenner), better known for [The Pound Era](https://en.wikipedia.org/wiki/The_Pound_Era).
# 
# He also wrote [Bucky](https://www.amazon.com/Bucky-Guided-Tour-Buckminster-Fuller/dp/0688001416) and [Geodesic Math and How to Use It](https://www.amazon.com/dp/0520239318).
# 
# By Hugh Kenner (book covers)
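# Picking up the geodesic thread promised above: here's a minimal sketch (an aside, not from Popko or Kenner) of how ```sin``` and ```cos``` place vertices on a sphere. The function ```ring``` is a hypothetical helper returning n equally spaced points around a circle of latitude.

# In[ ]:


from math import sin, cos, radians

def ring(latitude_deg, n, radius=1.0):
    """n equally spaced (x, y, z) points around a circle of latitude."""
    phi = radians(90 - latitude_deg)  # polar angle, measured from the +Z axis
    return [(radius * sin(phi) * cos(radians(360 * k / n)),
             radius * sin(phi) * sin(radians(360 * k / n)),
             radius * cos(phi))
            for k in range(n)]

# e.g. the five upper-pentagon vertices of an icosahedron sit at
# latitude arctan(1/2), about 26.57 degrees
upper_pentagon = ring(26.565, 5)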
# In[12]:


def make_table():
    keys = pd.Series(list(glossary.keys()), dtype=object)
    values = pd.Series(list(glossary.values()), dtype=object)
    df = pd.DataFrame({"term": keys, "definition": values}).set_index("term")
    # create a sorting column, wherein the terms are all uppercase
    df["sort_column"] = df.index.str.upper()
    df.sort_values(['sort_column'], axis=0, ascending=True, inplace=True)
    del df["sort_column"]  # now that the df is sorted, delete the sorting column
    return df


# In[13]:


# glossary is an ordinary Python dict, stored as JSON in a text file
glossary["matplotlib"] = "data visualization package for Python, originally written by John D. Hunter"
glossary["numpy"] = "number crunchy goodness, vectorizes computations on n-dimensional arrays"
glossary["pandas"] = "wraps numpy arrays in handsome frames with row and column indexes"
glossary["seaborn"] = "adds new powers to matplotlib, makes pretty plots"
glossary["API"] = "a set of functions that take variable arguments, providing programmed control of something"
glossary["Ruby"] = "a programming language somewhat like Python and Perl, invented by Yukihiro Matsumoto"
glossary["ndarray"] = "n-dimensional array, the star of the numpy package, a multi-axis data structure"
glossary["DataFrame"] = "the star of the pandas package, providing ndarrays with a framing infrastructure"


# In[14]:


glossary_df = make_table()


# The Python function above has the job of taking our ```glossary``` object, a Python dictionary, and turning it into a pandas DataFrame object. The dict's keys comprise our index of terms, sorted in a case-insensitive manner.

# In[15]:


pd.set_option('display.max_colwidth', None)  # don't truncate column contents
glossary_df


# In[16]:


glossary_df.to_json('glossary2.json')


# ## Slicing a Pandas DataFrame
# 
# We're free to pick out a range of rows based on starting and ending index values, using the ```.loc``` indexer with square brackets. The ```.iloc``` indexer instead assumes a purely positional index of consecutive integers, whether a labeled index is defined or not. Note that label-based slices with ```.loc``` include both endpoints, whereas positional slices exclude the outer bound.

# In[17]:


glossary_df.iloc[3:10]  # numeric indexing is from 0 and non-inclusive of the outer bound


# In[18]:


glossary_df.loc["HTML":"Kernel"]


# In[19]:


glossary_df.loc["Python":]  # slice from Python to the end


# ### A First Look at Seaborn
# 
# [Seaborn](https://seaborn.pydata.org/introduction.html)
# 
# The only change below is that ```sns.set()``` gets run prior to invoking the very same ```plot_functions```.
# 
# Notice the cosmetic differences, procured for free.

# In[20]:


plot_functions()


# In[21]:


sns.set()


# In[22]:


plot_functions()


# What does more advanced seaborn look like? [Click here](https://towardsdatascience.com/3-awesome-visualization-techniques-for-every-dataset-9737eecacbe8) for an example on *Medium*.

# ### Anatomy of a pandas Series
# 
# I am a Series; what are my parts? I'm more than just a numpy array, but you could say I have a numpy array as payload.
# 
# #### What does it eat?
# 
# How might I be [initialized](https://pandas.pydata.org/pandas-docs/stable/reference/series.html#constructor)? Let's try me.

# In[23]:


from pandas import Series


# In[24]:


data = {'a': 1, 'b': 2, 'z': 22}
test1 = Series(data)


# In[25]:


test1


# OK, so a dictionary works. You could decompose (deconstruct) a dict into its values and keys, using the corresponding methods, and feed those in separately, with the keys becoming the index, but why bother? Still, it's nice to know that we can.
# In[26]:


test1a = Series(data=list(data.values()), index=data.keys())
test1a


# Why was it necessary to feed ```data.values()``` to the list type, instead of just using it directly?
# 
# Modify the code and see.
# 
# Here (at least in the pandas version used for this notebook), the object returned by ```data.values()``` gets interpreted as a single value to be repeated over and over, once for each index row. Atom smash it with list( ) into component particles and you're set.

# In[27]:


from string import ascii_lowercase as letters

test2 = Series(np.arange(10),
               index=list(letters)[:10],  # just as many as needed
               name="Labeled",
               dtype=np.int8)


# In[28]:


test2


# In[29]:


payload = test2.values  # extract the numpy array nutty goodness


# In[30]:


type(payload)  # or tolist() if you wish a Python list


# In[31]:


payload


# In[32]:


def digitrange(minlen, maxlen, base=2):
    """Generator producing all tuples of digits to a given base."""
    digits = [0] * maxlen
    loop = True
    if minlen > 0:
        digits[minlen] = 1
    while loop:
        yield tuple(reversed(digits))
        # odometer-style counting: bump the lowest digit, then carry
        digits[0] += 1
        i = 0
        while digits[i] >= base:
            if (i + 1) >= maxlen:
                loop = False  # the odometer has rolled all the way over
                break
            digits[i] = 0
            digits[i+1] += 1
            i += 1


# In[33]:


from collections import defaultdict

gen = digitrange(0, 5, base=2)  # all 32 five-digit binary tuples
tally = defaultdict(int)
for p in gen:
    tally[p.count(1)] += 1  # how many of the five digits are 1?
print(tally.values())


# In[34]:


YouTubeVideo("WWv0RUxDfbs")
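# The tallies above connect back to the opening code cell: the number of five-digit binary tuples containing k ones is "5 choose k," i.e. a row of Pascal's Triangle. A quick sanity check follows (an aside, assuming Python 3.8+ for ```math.comb```).

# In[ ]:


from math import comb

print([tally[k] for k in sorted(tally)])  # [1, 5, 10, 10, 5, 1]
print([comb(5, k) for k in range(6)])     # binomial coefficients: same row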