# conventional way to import pandas
import pandas as pd
# read a dataset of Chipotle orders directly from a URL and store the results in a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
# examine the first 5 rows
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
Documentation for read_table
# read a dataset of movie reviewers (modifying the default parameter values for read_table)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols)
# examine the first 5 rows
users.head()
user_id | age | gender | occupation | zip_code | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_table('http://bit.ly/uforeports', sep=',')
# read_csv is equivalent to read_table, except it assumes a comma separator
ufo = pd.read_csv('http://bit.ly/uforeports')
# examine the first 5 rows
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# select the 'City' Series using bracket notation
ufo['City']
# or equivalently, use dot notation
ufo.City
0 Ithaca 1 Willingboro 2 Holyoke 3 Abilene 4 New York Worlds Fair 5 Valley City 6 Crater Lake 7 Alma 8 Eklutna 9 Hubbard 10 Fontana 11 Waterloo 12 Belton 13 Keokuk 14 Ludington 15 Forest Home 16 Los Angeles 17 Hapeville 18 Oneida 19 Bering Sea 20 Nebraska 21 NaN 22 NaN 23 Owensboro 24 Wilderness 25 San Diego 26 Wilderness 27 Clovis 28 Los Alamos 29 Ft. Duschene ... 18211 Holyoke 18212 Carson 18213 Pasadena 18214 Austin 18215 El Campo 18216 Garden Grove 18217 Berthoud Pass 18218 Sisterdale 18219 Garden Grove 18220 Shasta Lake 18221 Franklin 18222 Albrightsville 18223 Greenville 18224 Eufaula 18225 Simi Valley 18226 San Francisco 18227 San Francisco 18228 Kingsville 18229 Chicago 18230 Pismo Beach 18231 Pismo Beach 18232 Lodi 18233 Anchorage 18234 Capitola 18235 Fountain Hills 18236 Grant Park 18237 Spirit Lake 18238 Eagle River 18239 Eagle River 18240 Ybor Name: City, dtype: object
Bracket notation will always work, whereas dot notation has limitations: it can't be used if the Series name contains spaces, if the name clashes with a DataFrame method or attribute (such as head or shape), or (as shown next) to define the name of a new Series. A short sketch of the first two limitations follows the table below.
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.head()
City | Colors Reported | Shape Reported | State | Time | Location | |
---|---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 | Ithaca, NY |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 | Willingboro, NJ |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 | Holyoke, CO |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 | Abilene, KS |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 | New York Worlds Fair, NY |
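As noted above, dot notation also fails for names with spaces and for names that clash with DataFrame attributes. A minimal sketch of those two cases, assuming the 'ufo' DataFrame above and using a hypothetical 'shape' column added only for illustration:
# 1. a column name containing a space can only be accessed with bracket notation
ufo['Colors Reported'].head()              # works; ufo.Colors Reported would be a SyntaxError
# 2. a column name that clashes with a DataFrame attribute or method is shadowed by it
ufo['shape'] = 'demo'                      # hypothetical column, added only for this sketch
print(type(ufo.shape))                     # the 'shape' attribute (a tuple), not the column
print(type(ufo['shape']))                  # the column (a Series)
ufo.drop('shape', axis=1, inplace=True)    # remove the demo column again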
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
Methods end with parentheses, while attributes don't:
# example method: show the first 5 rows
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# example method: calculate summary statistics
movies.describe()
star_rating | duration | |
---|---|---|
count | 979.000000 | 979.000000 |
mean | 7.889785 | 120.979571 |
std | 0.336069 | 26.218010 |
min | 7.400000 | 64.000000 |
25% | 7.600000 | 102.000000 |
50% | 7.800000 | 117.000000 |
75% | 8.100000 | 134.000000 |
max | 9.300000 | 242.000000 |
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
star_rating float64 title object content_rating object genre object duration int64 actors_list object dtype: object
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])
title | content_rating | genre | actors_list | |
---|---|---|---|---|
count | 979 | 976 | 979 | 979 |
unique | 975 | 12 | 16 | 969 |
top | The Girl with the Dragon Tattoo | R | Drama | [u'Daniel Radcliffe', u'Emma Watson', u'Rupert... |
freq | 2 | 460 | 278 | 6 |
Documentation for describe
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
# examine the column names
ufo.columns
Index([u'City', u'Colors Reported', u'Shape Reported', u'State', u'Time'], dtype='object')
# rename two of the columns by using the 'rename' method
ufo.rename(columns={'Colors Reported':'Colors_Reported', 'Shape Reported':'Shape_Reported'}, inplace=True)
ufo.columns
Index([u'City', u'Colors_Reported', u'Shape_Reported', u'State', u'Time'], dtype='object')
Documentation for rename
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns
Index([u'city', u'colors reported', u'shape reported', u'state', u'time'], dtype='object')
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv('http://bit.ly/uforeports', header=0, names=ufo_cols)
ufo.columns
Index([u'city', u'colors reported', u'shape reported', u'state', u'time'], dtype='object')
Documentation for read_csv
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns
Index([u'city', u'colors_reported', u'shape_reported', u'state', u'time'], dtype='object')
Documentation for str.replace
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove a single column (axis=1 refers to columns)
ufo.drop('Colors Reported', axis=1, inplace=True)
ufo.head()
City | Shape Reported | State | Time | |
---|---|---|---|---|
0 | Ithaca | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
Documentation for drop
# remove multiple columns at once
ufo.drop(['City', 'State'], axis=1, inplace=True)
ufo.head()
Shape Reported | Time | |
---|---|---|
0 | TRIANGLE | 6/1/1930 22:00 |
1 | OTHER | 6/30/1930 20:00 |
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
# remove multiple rows at once (axis=0 refers to rows)
ufo.drop([0, 1], axis=0, inplace=True)
ufo.head()
Shape Reported | Time | |
---|---|---|
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
5 | DISK | 9/15/1934 15:30 |
6 | CIRCLE | 6/15/1935 0:00 |
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
Note: None of the sorting methods below affect the underlying data. (In other words, the sorting is temporary).
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542 (500) Days of Summer 5 12 Angry Men 201 12 Years a Slave 698 127 Hours 110 2001: A Space Odyssey Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864 [Rec] 526 Zulu 615 Zombieland 677 Zodiac 955 Zero Dark Thirty Name: title, dtype: object
Documentation for sort_values for a Series. (Prior to version 0.17, use order instead.)
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values('title').head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
542 | 7.8 | (500) Days of Summer | PG-13 | Comedy | 95 | [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
201 | 8.1 | 12 Years a Slave | R | Biography | 134 | [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... |
698 | 7.6 | 127 Hours | R | Adventure | 94 | [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] |
110 | 8.3 | 2001: A Space Odyssey | G | Mystery | 160 | [u'Keir Dullea', u'Gary Lockwood', u'William S... |
# sort in descending order instead
movies.sort_values('title', ascending=False).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
864 | 7.5 | [Rec] | R | Horror | 78 | [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... |
526 | 7.8 | Zulu | UNRATED | Drama | 138 | [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... |
615 | 7.7 | Zombieland | R | Comedy | 88 | [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... |
677 | 7.7 | Zodiac | R | Crime | 157 | [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... |
955 | 7.4 | Zero Dark Thirty | R | Drama | 157 | [u'Jessica Chastain', u'Joel Edgerton', u'Chri... |
Documentation for sort_values for a DataFrame. (Prior to version 0.17, use sort instead.)
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(['content_rating', 'duration']).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
713 | 7.6 | The Jungle Book | APPROVED | Animation | 78 | [u'Phil Harris', u'Sebastian Cabot', u'Louis P... |
513 | 7.8 | Invasion of the Body Snatchers | APPROVED | Horror | 80 | [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... |
272 | 8.1 | The Killing | APPROVED | Crime | 85 | [u'Sterling Hayden', u'Coleen Gray', u'Vince E... |
703 | 7.6 | Dracula | APPROVED | Horror | 85 | [u'Bela Lugosi', u'Helen Chandler', u'David Ma... |
612 | 7.7 | A Hard Day's Night | APPROVED | Comedy | 87 | [u'John Lennon', u'Paul McCartney', u'George H... |
Summary of changes to the sorting API in pandas 0.17
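Since the sorts above are temporary, here is a minimal sketch (assuming the 'movies' DataFrame above) of how to keep a sorted order:
# either reassign the result of 'sort_values' ...
movies_sorted = movies.sort_values('title')
# ... or sort the DataFrame in place
movies.sort_values('title', inplace=True)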
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# examine the number of rows and columns
movies.shape
(979, 6)
Goal: Filter the DataFrame rows to only show movies with a 'duration' of at least 200 minutes.
# create a list in which each element refers to a DataFrame row: True if the row satisfies the condition, False otherwise
booleans = []
for length in movies.duration:
if length >= 200:
booleans.append(True)
else:
booleans.append(False)
# confirm that the list has the same length as the DataFrame
len(booleans)
979
# examine the first five list elements
booleans[0:5]
[False, False, True, False, False]
# convert the list to a Series
is_long = pd.Series(booleans)
is_long.head()
0 False 1 False 2 True 3 False 4 False dtype: bool
# use bracket notation with the boolean Series to tell the DataFrame which rows to display
movies[is_long]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
# simplify the steps above: no need to write a for loop to create 'is_long' since pandas will broadcast the comparison
is_long = movies.duration >= 200
movies[is_long]
# or equivalently, write it in one line (no need to create the 'is_long' object)
movies[movies.duration >= 200]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
# select the 'genre' Series from the filtered DataFrame
movies[movies.duration >= 200].genre
# or equivalently, use the 'loc' method
movies.loc[movies.duration >= 200, 'genre']
2 Crime 7 Adventure 17 Drama 78 Crime 85 Adventure 142 Adventure 157 Drama 204 Adventure 445 Adventure 476 Drama 630 Biography 767 Action Name: genre, dtype: object
Documentation for loc
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# filter the DataFrame to only show movies with a 'duration' of at least 200 minutes
movies[movies.duration >= 200]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
Understanding logical operators:
- and: True only if both sides of the operator are True
- or: True if either side of the operator is True
# demonstration of the 'and' operator
print(True and True)
print(True and False)
print(False and False)
True False False
# demonstration of the 'or' operator
print(True or True)
print(True or False)
print(False or False)
True True False
Rules for specifying multiple filter criteria in pandas:
- use & instead of and
- use | instead of or
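A minimal sketch of why the Python keywords don't work here (assuming the 'movies' DataFrame above): 'and' forces pandas to evaluate the truth value of an entire Series, which raises an error.
# using the Python 'and' keyword on boolean Series raises a ValueError,
# because the truth value of an entire Series is ambiguous
try:
    movies[(movies.duration >= 200) and (movies.genre == 'Drama')]
except ValueError as e:
    print(e)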
Goal: Further filter the DataFrame of long movies (duration >= 200) to only show movies which also have a 'genre' of 'Drama'
# CORRECT: use the '&' operator to specify that both conditions are required
movies[(movies.duration >=200) & (movies.genre == 'Drama')]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
# INCORRECT: using the '|' operator would have shown movies that are either long or dramas (or both)
movies[(movies.duration >=200) | (movies.genre == 'Drama')].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
Goal: Filter the original DataFrame to show movies with a 'genre' of 'Crime' or 'Drama' or 'Action'
# use the '|' operator to specify that a row can match any of the three criteria
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].head(10)
# or equivalently, use the 'isin' method
movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
11 | 8.8 | Inception | PG-13 | Action | 148 | [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... |
12 | 8.8 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 124 | [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
Documentation for isin
Question: When reading from a file, how do I read in only a subset of the columns?
# read a dataset of UFO reports into a DataFrame, and check the columns
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.columns
Index([u'City', u'Colors Reported', u'Shape Reported', u'State', u'Time'], dtype='object')
# specify which columns to include by name
ufo = pd.read_csv('http://bit.ly/uforeports', usecols=['City', 'State'])
# alternatively, specify columns by position (positions 0 and 4 correspond to 'City' and 'Time')
ufo = pd.read_csv('http://bit.ly/uforeports', usecols=[0, 4])
ufo.columns
Index([u'City', u'Time'], dtype='object')
Question: When reading from a file, how do I read in only a subset of the rows?
# specify how many rows to read
ufo = pd.read_csv('http://bit.ly/uforeports', nrows=3)
ufo
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
Documentation for read_csv
Question: How do I iterate through a Series?
# Series are directly iterable (like a list)
for c in ufo.City:
print(c)
Ithaca Willingboro Holyoke
Question: How do I iterate through a DataFrame?
# various methods are available to iterate through a DataFrame
for index, row in ufo.iterrows():
print(index, row.City, row.State)
(0, 'Ithaca', 'NY') (1, 'Willingboro', 'NJ') (2, 'Holyoke', 'CO')
Documentation for iterrows
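The comment above notes that various iteration methods exist; a minimal sketch of 'itertuples', which is generally faster than 'iterrows' (assuming the three-row 'ufo' DataFrame above):
# 'itertuples' yields one namedtuple per row; columns are available as attributes
for row in ufo.itertuples():
    print(row.Index, row.City, row.State)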
Question: How do I drop all non-numeric columns from a DataFrame?
# read a dataset of alcohol consumption into a DataFrame, and check the data types
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# only include numeric columns in the DataFrame
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 dtype: object
Documentation for select_dtypes
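An alternative sketch (assuming the 'drinks' DataFrame above): exclude the non-numeric ('object') columns instead of including the numeric ones.
# keep every column except those with the 'object' data type
drinks.select_dtypes(exclude=['object']).dtypes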
Question: How do I know whether I should pass an argument as a string or a list?
# describe all of the numeric columns
drinks.describe()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|
count | 193.000000 | 193.000000 | 193.000000 | 193.000000 |
mean | 106.160622 | 80.994819 | 49.450777 | 4.717098 |
std | 101.143103 | 88.284312 | 79.697598 | 3.773298 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 20.000000 | 4.000000 | 1.000000 | 1.300000 |
50% | 76.000000 | 56.000000 | 8.000000 | 4.200000 |
75% | 188.000000 | 128.000000 | 59.000000 | 7.200000 |
max | 376.000000 | 438.000000 | 370.000000 | 14.400000 |
# pass the string 'all' to describe all columns
drinks.describe(include='all')
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
count | 193 | 193.000000 | 193.000000 | 193.000000 | 193.000000 | 193 |
unique | 193 | NaN | NaN | NaN | NaN | 6 |
top | Lesotho | NaN | NaN | NaN | NaN | Africa |
freq | 1 | NaN | NaN | NaN | NaN | 53 |
mean | NaN | 106.160622 | 80.994819 | 49.450777 | 4.717098 | NaN |
std | NaN | 101.143103 | 88.284312 | 79.697598 | 3.773298 | NaN |
min | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
25% | NaN | 20.000000 | 4.000000 | 1.000000 | 1.300000 | NaN |
50% | NaN | 76.000000 | 56.000000 | 8.000000 | 4.200000 | NaN |
75% | NaN | 188.000000 | 128.000000 | 59.000000 | 7.200000 | NaN |
max | NaN | 376.000000 | 438.000000 | 370.000000 | 14.400000 | NaN |
# pass a list of data types to only describe certain types
drinks.describe(include=['object', 'float64'])
country | total_litres_of_pure_alcohol | continent | |
---|---|---|---|
count | 193 | 193.000000 | 193 |
unique | 193 | NaN | 6 |
top | Lesotho | NaN | Africa |
freq | 1 | NaN | 53 |
mean | NaN | 4.717098 | NaN |
std | NaN | 3.773298 | NaN |
min | NaN | 0.000000 | NaN |
25% | NaN | 1.300000 | NaN |
50% | NaN | 4.200000 | NaN |
75% | NaN | 7.200000 | NaN |
max | NaN | 14.400000 | NaN |
# pass a list even if you only want to describe a single data type
drinks.describe(include=['object'])
country | continent | |
---|---|---|
count | 193 | 193 |
unique | 193 | 6 |
top | Lesotho | Africa |
freq | 1 | 53 |
Documentation for describe
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# drop a column (temporarily)
drinks.drop('continent', axis=1).head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 |
1 | Albania | 89 | 132 | 54 | 4.9 |
2 | Algeria | 25 | 0 | 14 | 0.7 |
3 | Andorra | 245 | 138 | 312 | 12.4 |
4 | Angola | 217 | 57 | 45 | 5.9 |
Documentation for drop
# drop a row (temporarily)
drinks.drop(2, axis=0).head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
5 | Antigua & Barbuda | 102 | 128 | 45 | 4.9 | North America |
When referring to rows or columns with the axis parameter, axis=0 refers to rows and axis=1 refers to columns:
# calculate the mean of each numeric column
drinks.mean()
# or equivalently, specify the axis explicitly
drinks.mean(axis=0)
beer_servings 106.160622 spirit_servings 80.994819 wine_servings 49.450777 total_litres_of_pure_alcohol 4.717098 dtype: float64
Documentation for mean
# calculate the mean of each row
drinks.mean(axis=1).head()
0 0.000 1 69.975 2 9.925 3 176.850 4 81.225 dtype: float64
When performing a mathematical operation with the axis parameter, axis=0 means the operation moves down the rows (producing one result per column), while axis=1 means it moves across the columns (producing one result per row):
# 'index' is an alias for axis 0
drinks.mean(axis='index')
beer_servings 106.160622 spirit_servings 80.994819 wine_servings 49.450777 total_litres_of_pure_alcohol 4.717098 dtype: float64
# 'columns' is an alias for axis 1
drinks.mean(axis='columns').head()
0 0.000 1 69.975 2 9.925 3 176.850 4 81.225 dtype: float64
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# normal way to access string methods in Python
'hello'.upper()
'HELLO'
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0 CHIPS AND FRESH TOMATO SALSA 1 IZZE 2 NANTUCKET NECTAR 3 CHIPS AND TOMATILLO-GREEN CHILI SALSA 4 CHICKEN BOWL Name: item_name, dtype: object
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
0 False 1 False 2 False 3 False 4 True Name: item_name, dtype: bool
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains('Chicken')].head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
5 | 3 | 1 | Chicken Bowl | [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... | $10.98 |
11 | 6 | 1 | Chicken Crispy Tacos | [Roasted Chili Corn Salsa, [Fajita Vegetables,... | $8.75 |
12 | 6 | 1 | Chicken Soft Tacos | [Roasted Chili Corn Salsa, [Rice, Black Beans,... | $8.75 |
13 | 7 | 1 | Chicken Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... | $11.25 |
# string methods can be chained together
orders.choice_description.str.replace('[', '').str.replace(']', '').head()
0 NaN 1 Clementine 2 Apple 3 NaN 4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ... Name: choice_description, dtype: object
# many pandas string methods support regular expressions (regex)
orders.choice_description.str.replace(r'[\[\]]', '').head()
0 NaN 1 Clementine 2 Apple 3 NaN 4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ... Name: choice_description, dtype: object
String handling section of the pandas API reference
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# examine the data type of each Series
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# change the data type of an existing Series
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes
country object beer_servings float64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
Documentation for astype
# alternatively, change the data type of a Series while reading in a file
drinks = pd.read_csv('http://bit.ly/drinksbycountry', dtype={'beer_servings':float})
drinks.dtypes
country object beer_servings float64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# examine the data type of each Series
orders.dtypes
order_id int64 quantity int64 item_name object choice_description object item_price object dtype: object
# convert a string to a number in order to do math
orders.item_price.str.replace('$', '').astype(float).mean()
7.464335785374397
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
0 False 1 False 2 False 3 False 4 True Name: item_name, dtype: bool
# convert a boolean Series to an integer (False = 0, True = 1)
orders.item_name.str.contains('Chicken').astype(int).head()
0 0 1 0 2 0 3 0 4 1 Name: item_name, dtype: int32
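A minimal sketch of why that conversion matters (assuming the 'orders' DataFrame above): because True counts as 1 and False as 0, summing a boolean Series counts the matching rows.
# count how many order lines mention 'Chicken'
orders.item_name.str.contains('Chicken').sum()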
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# calculate the mean beer servings across the entire dataset
drinks.beer_servings.mean()
106.16062176165804
# calculate the mean beer servings just for countries in Africa
drinks[drinks.continent=='Africa'].beer_servings.mean()
61.471698113207545
# calculate the mean beer servings for each continent
drinks.groupby('continent').beer_servings.mean()
continent Africa 61.471698 Asia 37.045455 Europe 193.777778 North America 145.434783 Oceania 89.687500 South America 175.083333 Name: beer_servings, dtype: float64
Documentation for groupby
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby('continent').beer_servings.max()
continent Africa 376 Asia 247 Europe 361 North America 285 Oceania 306 South America 333 Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])
count | mean | min | max | |
---|---|---|---|---|
continent | ||||
Africa | 53 | 61.471698 | 0 | 376 |
Asia | 44 | 37.045455 | 0 | 247 |
Europe | 45 | 193.777778 | 0 | 361 |
North America | 23 | 145.434783 | 1 | 285 |
Oceania | 16 | 89.687500 | 0 | 306 |
South America | 12 | 175.083333 | 93 | 333 |
Documentation for agg
# specifying a column to which the aggregation function should be applied is not required
drinks.groupby('continent').mean()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|
continent | ||||
Africa | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
Asia | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
Europe | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
North America | 145.434783 | 165.739130 | 24.521739 | 5.995652 |
Oceania | 89.687500 | 58.437500 | 35.625000 | 3.381250 |
South America | 175.083333 | 114.750000 | 62.416667 | 6.308333 |
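A hedged sketch extending the 'agg' example above (assuming the 'drinks' DataFrame): different aggregation functions can be applied to different columns by passing a dictionary.
# apply a different aggregation to each selected column
drinks.groupby('continent').agg({'beer_servings': 'mean', 'wine_servings': 'max'})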
# allow plots to appear in the notebook
%matplotlib inline
# side-by-side bar plot of the DataFrame directly above
drinks.groupby('continent').mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x9a52cf8>
Documentation for plot
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# examine the data type of each Series
movies.dtypes
star_rating float64 title object content_rating object genre object duration int64 actors_list object dtype: object
Exploring a non-numeric Series:
# count the non-null values, unique values, and frequency of the most common value
movies.genre.describe()
count 979 unique 16 top Drama freq 278 Name: genre, dtype: object
Documentation for describe
# count how many times each value in the Series occurs
movies.genre.value_counts()
Drama 278 Comedy 156 Action 136 Crime 124 Biography 77 Adventure 75 Animation 62 Horror 29 Mystery 16 Western 9 Thriller 5 Sci-Fi 5 Film-Noir 3 Family 2 Fantasy 1 History 1 Name: genre, dtype: int64
Documentation for value_counts
# display percentages instead of raw counts
movies.genre.value_counts(normalize=True)
Drama 0.283963 Comedy 0.159346 Action 0.138917 Crime 0.126660 Biography 0.078652 Adventure 0.076609 Animation 0.063330 Horror 0.029622 Mystery 0.016343 Western 0.009193 Thriller 0.005107 Sci-Fi 0.005107 Film-Noir 0.003064 Family 0.002043 Fantasy 0.001021 History 0.001021 Name: genre, dtype: float64
# 'value_counts' (like many pandas methods) outputs a Series
type(movies.genre.value_counts())
pandas.core.series.Series
# thus, you can add another Series method on the end
movies.genre.value_counts().head()
Drama 278 Comedy 156 Action 136 Crime 124 Biography 77 Name: genre, dtype: int64
# display the unique values in the Series
movies.genre.unique()
array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography', 'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi', 'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)
# count the number of unique values in the Series
movies.genre.nunique()
16
# compute a cross-tabulation of two Series
pd.crosstab(movies.genre, movies.content_rating)
content_rating | APPROVED | G | GP | NC-17 | NOT RATED | PASSED | PG | PG-13 | R | TV-MA | UNRATED | X |
---|---|---|---|---|---|---|---|---|---|---|---|---|
genre | ||||||||||||
Action | 3 | 1 | 1 | 0 | 4 | 1 | 11 | 44 | 67 | 0 | 3 | 0 |
Adventure | 3 | 2 | 0 | 0 | 5 | 1 | 21 | 23 | 17 | 0 | 2 | 0 |
Animation | 3 | 20 | 0 | 0 | 3 | 0 | 25 | 5 | 5 | 0 | 1 | 0 |
Biography | 1 | 2 | 1 | 0 | 1 | 0 | 6 | 29 | 36 | 0 | 0 | 0 |
Comedy | 9 | 2 | 1 | 1 | 16 | 3 | 23 | 23 | 73 | 0 | 4 | 1 |
Crime | 6 | 0 | 0 | 1 | 7 | 1 | 6 | 4 | 87 | 0 | 11 | 1 |
Drama | 12 | 3 | 0 | 4 | 24 | 1 | 25 | 55 | 143 | 1 | 9 | 1 |
Family | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
Fantasy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Film-Noir | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
History | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
Horror | 2 | 0 | 0 | 1 | 1 | 0 | 1 | 2 | 16 | 0 | 5 | 1 |
Mystery | 4 | 1 | 0 | 0 | 1 | 0 | 1 | 2 | 6 | 0 | 1 | 0 |
Sci-Fi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 |
Thriller | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 0 | 0 |
Western | 1 | 0 | 0 | 0 | 2 | 0 | 2 | 1 | 3 | 0 | 0 | 0 |
Documentation for crosstab
Exploring a numeric Series:
# calculate various summary statistics
movies.duration.describe()
count 979.000000 mean 120.979571 std 26.218010 min 64.000000 25% 102.000000 50% 117.000000 75% 134.000000 max 242.000000 Name: duration, dtype: float64
# many statistics are implemented as Series methods
movies.duration.mean()
120.97957099080695
Documentation for mean
# 'value_counts' is primarily useful for categorical data, not numerical data
movies.duration.value_counts().head()
112 23 113 22 102 20 101 20 129 19 Name: duration, dtype: int64
# allow plots to appear in the notebook
%matplotlib inline
# histogram of the 'duration' Series (shows the distribution of a numerical variable)
movies.duration.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x9e95710>
# bar plot of the 'value_counts' for the 'genre' Series
movies.genre.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xa234208>
Documentation for plot
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
What does "NaN" mean?
numpy.nan
.read_csv
detects missing values (by default) when reading the file, and replaces them with this special value.Documentation for read_csv
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing)
ufo.isnull().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | True | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
# 'notnull' returns the opposite of 'isnull' (True if not missing, False if missing)
ufo.notnull().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | False | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
# count the number of missing values in each Series
ufo.isnull().sum()
City 25 Colors Reported 15359 Shape Reported 2644 State 0 Time 0 dtype: int64
This calculation works because:
- The sum method for a DataFrame operates on axis=0 by default (and thus produces column sums).
- In order to add boolean values, pandas converts True to 1 and False to 0.
# use the 'isnull' Series method to filter the DataFrame rows
ufo[ufo.City.isnull()].head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
How to handle missing values depends on the dataset as well as the nature of your analysis. Here are some options:
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how='any').shape
(2486, 5)
Documentation for dropna
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how='all').shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
(15576, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
(18237, 5)
# 'value_counts' does not include missing values by default
ufo['Shape Reported'].value_counts().head()
LIGHT 2803 DISK 2122 TRIANGLE 1889 OTHER 1402 CIRCLE 1365 Name: Shape Reported, dtype: int64
# explicitly include missing values
ufo['Shape Reported'].value_counts(dropna=False).head()
LIGHT 2803 NaN 2644 DISK 2122 TRIANGLE 1889 OTHER 1402 Name: Shape Reported, dtype: int64
Documentation for value_counts
# fill in missing values with a specified value
ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)
Documentation for fillna
# confirm that the missing values were filled in
ufo['Shape Reported'].value_counts().head()
VARIOUS 2977 LIGHT 2803 DISK 2122 TRIANGLE 1889 OTHER 1402 Name: Shape Reported, dtype: int64
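A minimal sketch of the equivalent assignment form (assuming the 'ufo' DataFrame above), which avoids using inplace on a selected column:
# equivalent to the 'fillna' call above, written as an assignment instead of inplace=True
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='VARIOUS')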
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
RangeIndex(start=0, stop=193, step=1)
# column names are also stored in a special "index" object
drinks.columns
Index([u'country', u'beer_servings', u'spirit_servings', u'wine_servings', u'total_litres_of_pure_alcohol', u'continent'], dtype='object')
# neither the index nor the columns are included in the shape
drinks.shape
(193, 6)
# index and columns both default to integers if you don't define them
pd.read_table('http://bit.ly/movieusers', header=None, sep='|').head()
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
What is the index used for?
# identification: index remains with each row when filtering the DataFrame
drinks[drinks.continent=='South America']
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
6 | Argentina | 193 | 25 | 221 | 8.3 | South America |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
23 | Brazil | 245 | 145 | 16 | 7.2 | South America |
35 | Chile | 130 | 124 | 172 | 7.6 | South America |
37 | Colombia | 159 | 76 | 3 | 4.2 | South America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
72 | Guyana | 93 | 302 | 1 | 7.1 | South America |
132 | Paraguay | 213 | 117 | 74 | 7.3 | South America |
133 | Peru | 163 | 160 | 21 | 6.1 | South America |
163 | Suriname | 128 | 178 | 7 | 5.6 | South America |
185 | Uruguay | 115 | 35 | 220 | 6.6 | South America |
188 | Venezuela | 333 | 100 | 3 | 7.7 | South America |
# selection: select a portion of the DataFrame using the index
drinks.loc[23, 'beer_servings']
245
Documentation for loc
# set an existing column as the index
drinks.set_index('country', inplace=True)
drinks.head()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
country | |||||
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
Documentation for set_index
# 'country' is now the index
drinks.index
Index([u'Afghanistan', u'Albania', u'Algeria', u'Andorra', u'Angola', u'Antigua & Barbuda', u'Argentina', u'Armenia', u'Australia', u'Austria', ... u'Tanzania', u'USA', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Yemen', u'Zambia', u'Zimbabwe'], dtype='object', name=u'country', length=193)
# 'country' is no longer a column
drinks.columns
Index([u'beer_servings', u'spirit_servings', u'wine_servings', u'total_litres_of_pure_alcohol', u'continent'], dtype='object')
# 'country' data is no longer part of the DataFrame contents
drinks.shape
(193, 5)
# country name can now be used for selection
drinks.loc['Brazil', 'beer_servings']
245
# index name is optional
drinks.index.name = None
drinks.head()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
# restore the index name, and move the index back to a column
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Documentation for reset_index
# many DataFrame methods output a DataFrame
drinks.describe()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|
count | 193.000000 | 193.000000 | 193.000000 | 193.000000 |
mean | 106.160622 | 80.994819 | 49.450777 | 4.717098 |
std | 101.143103 | 88.284312 | 79.697598 | 3.773298 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 20.000000 | 4.000000 | 1.000000 | 1.300000 |
50% | 76.000000 | 56.000000 | 8.000000 | 4.200000 |
75% | 188.000000 | 128.000000 | 59.000000 | 7.200000 |
max | 376.000000 | 438.000000 | 370.000000 | 14.400000 |
# you can interact with any DataFrame using its index and columns
drinks.describe().loc['25%', 'beer_servings']
20.0
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index
drinks.index
RangeIndex(start=0, stop=193, step=1)
# every Series also has an index (which carries over from the DataFrame)
drinks.continent.head()
0 Asia 1 Europe 2 Africa 3 Europe 4 Africa Name: continent, dtype: object
# set 'country' as the index
drinks.set_index('country', inplace=True)
Documentation for set_index
# Series index is on the left, values are on the right
drinks.continent.head()
country Afghanistan Asia Albania Europe Algeria Africa Andorra Europe Angola Africa Name: continent, dtype: object
# another example of a Series (output from the 'value_counts' method)
drinks.continent.value_counts()
Africa 53 Europe 45 Asia 44 North America 23 Oceania 16 South America 12 Name: continent, dtype: int64
Documentation for value_counts
# access the Series index
drinks.continent.value_counts().index
Index([u'Africa', u'Europe', u'Asia', u'North America', u'Oceania', u'South America'], dtype='object')
# access the Series values
drinks.continent.value_counts().values
array([53, 45, 44, 23, 16, 12], dtype=int64)
# elements in a Series can be selected by index (using bracket notation)
drinks.continent.value_counts()['Africa']
53
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
South America 12 Oceania 16 North America 23 Asia 44 Europe 45 Africa 53 Name: continent, dtype: int64
# any Series can also be sorted by its index
drinks.continent.value_counts().sort_index()
Africa 53 Asia 44 Europe 45 North America 23 Oceania 16 South America 12 Name: continent, dtype: int64
Documentation for sort_values
and sort_index
What is the index used for?
# 'beer_servings' Series contains the average annual beer servings per person
drinks.beer_servings.head()
country Afghanistan 0 Albania 89 Algeria 25 Andorra 245 Angola 217 Name: beer_servings, dtype: int64
# create a Series containing the population of two countries
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name='population')
people
Albania 3000000 Andorra 85000 Name: population, dtype: int64
Documentation for Series
# calculate the total annual beer servings for each country
(drinks.beer_servings * people).head()
Afghanistan NaN Albania 267000000.0 Algeria NaN Andorra 20825000.0 Angola NaN dtype: float64
# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)
pd.concat([drinks, people], axis=1).head()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | population | |
---|---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia | NaN |
Albania | 89 | 132 | 54 | 4.9 | Europe | 3000000.0 |
Algeria | 25 | 0 | 14 | 0.7 | Africa | NaN |
Andorra | 245 | 138 | 312 | 12.4 | Europe | 85000.0 |
Angola | 217 | 57 | 45 | 5.9 | Africa | NaN |
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head(3)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
The loc method is used to select rows and columns by label. You can pass it a single label, a list of labels, a slice of labels, a boolean Series, or a colon (which means "all labels"):
# row 0, all columns
ufo.loc[0, :]
City Ithaca Colors Reported NaN Shape Reported TRIANGLE State NY Time 6/1/1930 22:00 Name: 0, dtype: object
# rows 0 and 1 and 2, all columns
ufo.loc[[0, 1, 2], :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), all columns
ufo.loc[0:2, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# this implies "all columns", but explicitly stating "all columns" is better
ufo.loc[0:2]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), column 'City'
ufo.loc[0:2, 'City']
0 Ithaca 1 Willingboro 2 Holyoke Name: City, dtype: object
# rows 0 through 2 (inclusive), columns 'City' and 'State'
ufo.loc[0:2, ['City', 'State']]
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
# accomplish the same thing using double brackets - but using 'loc' is preferred since it's more explicit
ufo[['City', 'State']].head(3)
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
ufo.loc[0:2, 'City':'State']
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
# accomplish the same thing using 'head' and 'drop'
ufo.head(3).drop('Time', axis=1)
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
# rows in which the 'City' is 'Oakland', column 'State'
ufo.loc[ufo.City=='Oakland', 'State']
1694 CA 2144 CA 4686 MD 7293 CA 8488 CA 8768 CA 10816 OR 10948 CA 11045 CA 12322 CA 12941 CA 16803 MD 17322 CA Name: State, dtype: object
# accomplish the same thing using "chained indexing" - but using 'loc' is preferred since chained indexing can cause problems
ufo[ufo.City=='Oakland'].State
1694 CA 2144 CA 4686 MD 7293 CA 8488 CA 8768 CA 10816 OR 10948 CA 11045 CA 12322 CA 12941 CA 16803 MD 17322 CA Name: State, dtype: object
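A minimal sketch of the problem mentioned above, using a hypothetical copy named 'demo' so the original DataFrame is left untouched:
# chained indexing operates on a temporary object, so the assignment may not reach 'demo',
# and pandas typically emits a SettingWithCopyWarning
demo = ufo.copy()
demo[demo.City == 'Oakland']['State'] = 'CA'       # may silently fail to modify 'demo'
# the 'loc' form modifies the DataFrame as intended
demo.loc[demo.City == 'Oakland', 'State'] = 'CA'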
The iloc method is used to select rows and columns by integer position. You can pass it a single integer, a list of integers, a slice of integers, or a colon (which means "all positions"):
# rows in positions 0 and 1, columns in positions 0 and 3
ufo.iloc[[0, 1], [0, 3]]
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
ufo.iloc[0:2, 0:4]
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
# rows in positions 0 through 2 (exclusive), all columns
ufo.iloc[0:2, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
# accomplish the same thing - but using 'iloc' is preferred since it's more explicit
ufo[0:2]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
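Because iloc works purely with integer positions, it also accepts negative positions counted from the end, which is handy for grabbing the last few rows or the last column. A minimal sketch, shown without output:
# hedged sketch: negative positions count backward from the end
ufo.iloc[-3:, :]    # last 3 rows, all columns
ufo.iloc[:, -1]     # all rows, last column (the 'Time' Series)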
The ix method is used to select rows and columns by label or integer position, and should only be used when you need to mix label-based and integer-based selection in the same call. (Note that ix has since been deprecated in favor of loc and iloc; see the sketch after the examples below.)
# read a dataset of alcohol consumption into a DataFrame and set 'country' as the index
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
drinks.head()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
country | |||||
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
# row with label 'Albania', column in position 0
drinks.ix['Albania', 0]
89
# row in position 1, column with label 'beer_servings'
drinks.ix[1, 'beer_servings']
89
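Since ix is no longer available in recent pandas releases, the two selections above are typically rewritten with loc or iloc plus a lookup on the index or columns. A minimal sketch, shown without output:
# hedged sketch: modern equivalents of the 'ix' calls above
drinks.loc['Albania', drinks.columns[0]]                   # row label plus column position
drinks.iloc[1, drinks.columns.get_loc('beer_servings')]    # row position plus column label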
Rules for using numbers with ix: if the axis is labeled with strings, numbers are treated as integer positions (so slicing is exclusive of the endpoint), whereas if the axis is labeled with integers, numbers are treated as labels (so slicing is inclusive):
# rows 'Albania' through 'Andorra' (inclusive), columns in positions 0 through 2 (exclusive)
drinks.ix['Albania':'Andorra', 0:2]
beer_servings | spirit_servings | |
---|---|---|
country | ||
Albania | 89 | 132 |
Algeria | 25 | 0 |
Andorra | 245 | 138 |
# rows 0 through 2 (inclusive), columns in positions 0 through 2 (exclusive)
ufo.ix[0:2, 0:2]
City | Colors Reported | |
---|---|---|
0 | Ithaca | NaN |
1 | Willingboro | NaN |
2 | Holyoke | NaN |
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
ufo.shape
(18241, 5)
# remove the 'City' column (doesn't affect the DataFrame since inplace=False)
ufo.drop('City', axis=1).head()
Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|
0 | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | NaN | DISK | KS | 6/1/1931 13:00 |
4 | NaN | LIGHT | NY | 4/18/1933 19:00 |
# confirm that the 'City' column was not actually removed
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove the 'City' column (does affect the DataFrame since inplace=True)
ufo.drop('City', axis=1, inplace=True)
# confirm that the 'City' column was actually removed
ufo.head()
Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|
0 | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | NaN | DISK | KS | 6/1/1931 13:00 |
4 | NaN | LIGHT | NY | 4/18/1933 19:00 |
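drop can also remove several columns at once by passing a list of labels, and it removes rows instead of columns when axis=0 (the default) is used. A minimal sketch, shown without output (neither call affects the DataFrame since inplace=False):
# hedged sketch: drop multiple columns by passing a list of labels
ufo.drop(['Colors Reported', 'Shape Reported'], axis=1).head()
# hedged sketch: drop rows by their index labels using axis=0
ufo.drop([0, 1], axis=0).head()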
# drop a row if any value is missing from that row (doesn't affect the DataFrame since inplace=False)
ufo.dropna(how='any').shape
(2490, 4)
# confirm that no rows were actually removed
ufo.shape
(18241, 4)
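dropna also accepts a subset parameter to only consider certain columns when deciding whether a row is missing data, and a thresh parameter that keeps rows with at least a given number of non-missing values. A minimal sketch, shown without output:
# hedged sketch: only drop a row if its 'Shape Reported' value is missing
ufo.dropna(subset=['Shape Reported']).shape
# hedged sketch: keep rows that have at least 3 non-missing values
ufo.dropna(thresh=3).shape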
# use an assignment statement instead of the 'inplace' parameter
ufo = ufo.set_index('Time')
ufo.tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | NaN | TRIANGLE | IL |
12/31/2000 23:00 | NaN | DISK | IA |
12/31/2000 23:45 | NaN | NaN | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | NaN | OVAL | FL |
# fill missing values using "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='bfill').tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | RED | TRIANGLE | IL |
12/31/2000 23:00 | RED | DISK | IA |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | NaN | OVAL | FL |
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='ffill').tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | RED | TRIANGLE | IL |
12/31/2000 23:00 | RED | DISK | IA |
12/31/2000 23:45 | RED | DISK | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | RED | OVAL | FL |
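Note that in newer pandas releases the method parameter of fillna has been deprecated; the same behavior is available through the dedicated bfill and ffill methods. A minimal sketch, shown without output:
# hedged sketch: equivalents in newer pandas, where fillna(method=...) is deprecated
ufo.bfill().tail()
ufo.ffill().tail()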
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# exact memory usage is unknown because object columns hold references to strings stored elsewhere in memory
drinks.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 6 columns): country 193 non-null object beer_servings 193 non-null int64 spirit_servings 193 non-null int64 wine_servings 193 non-null int64 total_litres_of_pure_alcohol 193 non-null float64 continent 193 non-null object dtypes: float64(1), int64(3), object(2) memory usage: 9.1+ KB
# force pandas to calculate the true memory usage
drinks.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 6 columns): country 193 non-null object beer_servings 193 non-null int64 spirit_servings 193 non-null int64 wine_servings 193 non-null int64 total_litres_of_pure_alcohol 193 non-null float64 continent 193 non-null object dtypes: float64(1), int64(3), object(2) memory usage: 24.4 KB
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index 72 country 9500 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 9244 dtype: int64
Documentation for info and memory_usage
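To get a single figure for the whole DataFrame, the per-Series values can simply be summed. A minimal sketch, shown without output (the total corresponds to the 24.4 KB reported by info(memory_usage='deep') above):
# hedged sketch: total memory usage in bytes
drinks.memory_usage(deep=True).sum()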
# use the 'category' data type (new in pandas 0.15) to store the 'continent' strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent category dtype: object
# 'continent' Series appears to be unchanged
drinks.continent.head()
0 Asia 1 Europe 2 Africa 3 Europe 4 Africa Name: continent, dtype: category Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks.continent.cat.codes.head()
0 1 1 2 2 0 3 2 4 0 dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)