# conventional way to import pandas
import pandas as pd
# read a dataset of Chipotle orders directly from a URL and store the results in a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
# examine the first 5 rows
orders.head()
 | order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
Documentation for read_table
# read a dataset of movie reviewers (modifying the default parameter values for read_table)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols)
# examine the first 5 rows
users.head()
 | user_id | age | gender | occupation | zip_code |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_table('http://bit.ly/uforeports', sep=',')
# read_csv is equivalent to read_table, except it assumes a comma separator
ufo = pd.read_csv('http://bit.ly/uforeports')
# examine the first 5 rows
ufo.head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# select the 'City' Series using bracket notation
ufo['City']
# or equivalently, use dot notation
ufo.City
0                      Ithaca
1                 Willingboro
2                     Holyoke
3                     Abilene
4        New York Worlds Fair
5                 Valley City
6                 Crater Lake
7                        Alma
8                     Eklutna
9                     Hubbard
10                    Fontana
11                   Waterloo
12                     Belton
13                     Keokuk
14                  Ludington
15                Forest Home
16                Los Angeles
17                  Hapeville
18                     Oneida
19                 Bering Sea
20                   Nebraska
21                        NaN
22                        NaN
23                  Owensboro
24                 Wilderness
25                  San Diego
26                 Wilderness
27                     Clovis
28                 Los Alamos
29               Ft. Duschene
                 ...
18211                 Holyoke
18212                  Carson
18213                Pasadena
18214                  Austin
18215                El Campo
18216            Garden Grove
18217           Berthoud Pass
18218              Sisterdale
18219            Garden Grove
18220             Shasta Lake
18221                Franklin
18222          Albrightsville
18223              Greenville
18224                 Eufaula
18225             Simi Valley
18226           San Francisco
18227           San Francisco
18228              Kingsville
18229                 Chicago
18230             Pismo Beach
18231             Pismo Beach
18232                    Lodi
18233               Anchorage
18234                Capitola
18235          Fountain Hills
18236              Grant Park
18237             Spirit Lake
18238             Eagle River
18239             Eagle River
18240                    Ybor
Name: City, dtype: object
Bracket notation will always work, whereas dot notation has limitations:
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.head()
 | City | Colors Reported | Shape Reported | State | Time | Location |
---|---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 | Ithaca, NY |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 | Willingboro, NJ |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 | Holyoke, CO |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 | Abilene, KS |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 | New York Worlds Fair, NY |
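To make the dot-notation limitations concrete, here is a minimal sketch (the 'shape' column is hypothetical, created only for this demonstration):
# a column name containing a space can only be accessed with brackets
ufo['Colors Reported']          # works; ufo.Colors Reported is a SyntaxError
# a column whose name collides with a DataFrame attribute also requires brackets
ufo['shape'] = 'OVAL'           # hypothetical column named 'shape'
ufo.shape                       # still returns the (rows, columns) tuple
ufo['shape']                    # returns the new Series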
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
Methods end with parentheses, while attributes don't:
# example method: show the first 5 rows
movies.head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# example method: calculate summary statistics
movies.describe()
 | star_rating | duration |
---|---|---|
count | 979.000000 | 979.000000 |
mean | 7.889785 | 120.979571 |
std | 0.336069 | 26.218010 |
min | 7.400000 | 64.000000 |
25% | 7.600000 | 102.000000 |
50% | 7.800000 | 117.000000 |
75% | 8.100000 | 134.000000 |
max | 9.300000 | 242.000000 |
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])
 | title | content_rating | genre | actors_list |
---|---|---|---|---|
count | 979 | 976 | 979 | 979 |
unique | 975 | 12 | 16 | 969 |
top | The Girl with the Dragon Tattoo | R | Drama | [u'Daniel Radcliffe', u'Emma Watson', u'Rupert... |
freq | 2 | 460 | 278 | 6 |
Documentation for describe
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
# examine the column names
ufo.columns
Index([u'City', u'Colors Reported', u'Shape Reported', u'State', u'Time'], dtype='object')
# rename two of the columns by using the 'rename' method
ufo.rename(columns={'Colors Reported':'Colors_Reported', 'Shape Reported':'Shape_Reported'}, inplace=True)
ufo.columns
Index([u'City', u'Colors_Reported', u'Shape_Reported', u'State', u'Time'], dtype='object')
Documentation for rename
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns
Index([u'city', u'colors reported', u'shape reported', u'state', u'time'], dtype='object')
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv('http://bit.ly/uforeports', header=0, names=ufo_cols)
ufo.columns
Index([u'city', u'colors reported', u'shape reported', u'state', u'time'], dtype='object')
Documentation for read_csv
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns
Index([u'city', u'colors_reported', u'shape_reported', u'state', u'time'], dtype='object')
Documentation for str.replace
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove a single column (axis=1 refers to columns)
ufo.drop('Colors Reported', axis=1, inplace=True)
ufo.head()
 | City | Shape Reported | State | Time |
---|---|---|---|---|
0 | Ithaca | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
Documentation for drop
# remove multiple columns at once
ufo.drop(['City', 'State'], axis=1, inplace=True)
ufo.head()
 | Shape Reported | Time |
---|---|---|
0 | TRIANGLE | 6/1/1930 22:00 |
1 | OTHER | 6/30/1930 20:00 |
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
# remove multiple rows at once (axis=0 refers to rows)
ufo.drop([0, 1], axis=0, inplace=True)
ufo.head()
 | Shape Reported | Time |
---|---|---|
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
5 | DISK | 9/15/1934 15:30 |
6 | CIRCLE | 6/15/1935 0:00 |
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
Note: None of the sorting methods below affect the underlying data. (In other words, the sorting is temporary).
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542     (500) Days of Summer
5               12 Angry Men
201         12 Years a Slave
698                127 Hours
110    2001: A Space Odyssey
Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864               [Rec]
526                Zulu
615          Zombieland
677              Zodiac
955    Zero Dark Thirty
Name: title, dtype: object
Documentation for sort_values for a Series. (Prior to version 0.17, use order instead.)
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values('title').head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
542 | 7.8 | (500) Days of Summer | PG-13 | Comedy | 95 | [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
201 | 8.1 | 12 Years a Slave | R | Biography | 134 | [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... |
698 | 7.6 | 127 Hours | R | Adventure | 94 | [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] |
110 | 8.3 | 2001: A Space Odyssey | G | Mystery | 160 | [u'Keir Dullea', u'Gary Lockwood', u'William S... |
# sort in descending order instead
movies.sort_values('title', ascending=False).head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
864 | 7.5 | [Rec] | R | Horror | 78 | [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... |
526 | 7.8 | Zulu | UNRATED | Drama | 138 | [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... |
615 | 7.7 | Zombieland | R | Comedy | 88 | [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... |
677 | 7.7 | Zodiac | R | Crime | 157 | [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... |
955 | 7.4 | Zero Dark Thirty | R | Drama | 157 | [u'Jessica Chastain', u'Joel Edgerton', u'Chri... |
Documentation for sort_values for a DataFrame. (Prior to version 0.17, use sort instead.)
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(['content_rating', 'duration']).head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
713 | 7.6 | The Jungle Book | APPROVED | Animation | 78 | [u'Phil Harris', u'Sebastian Cabot', u'Louis P... |
513 | 7.8 | Invasion of the Body Snatchers | APPROVED | Horror | 80 | [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... |
272 | 8.1 | The Killing | APPROVED | Crime | 85 | [u'Sterling Hayden', u'Coleen Gray', u'Vince E... |
703 | 7.6 | Dracula | APPROVED | Horror | 85 | [u'Bela Lugosi', u'Helen Chandler', u'David Ma... |
612 | 7.7 | A Hard Day's Night | APPROVED | Comedy | 87 | [u'John Lennon', u'Paul McCartney', u'George H... |
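Since the sorting above is temporary, one way to persist it (a minimal sketch) is to assign the result back:
# persist the sorted order by assigning the result back
movies = movies.sort_values(['content_rating', 'duration'])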
Summary of changes to the sorting API in pandas 0.17
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# examine the number of rows and columns
movies.shape
(979, 6)
Goal: Filter the DataFrame rows to only show movies with a 'duration' of at least 200 minutes.
# create a list in which each element refers to a DataFrame row: True if the row satisfies the condition, False otherwise
booleans = []
for length in movies.duration:
if length >= 200:
booleans.append(True)
else:
booleans.append(False)
# confirm that the list has the same length as the DataFrame
len(booleans)
979
# examine the first five list elements
booleans[0:5]
[False, False, True, False, False]
# convert the list to a Series
is_long = pd.Series(booleans)
is_long.head()
0    False
1    False
2     True
3    False
4    False
dtype: bool
# use bracket notation with the boolean Series to tell the DataFrame which rows to display
movies[is_long]
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
# simplify the steps above: no need to write a for loop to create 'is_long' since pandas will broadcast the comparison
is_long = movies.duration >= 200
movies[is_long]
# or equivalently, write it in one line (no need to create the 'is_long' object)
movies[movies.duration >= 200]
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
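As an aside, the same filter can also be expressed with the query method (a sketch, not used elsewhere in this section):
# 'query' evaluates a boolean expression written as a string
movies.query('duration >= 200')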
# select the 'genre' Series from the filtered DataFrame
movies[movies.duration >= 200].genre
# or equivalently, use the 'loc' method
movies.loc[movies.duration >= 200, 'genre']
2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object
Documentation for loc
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# filter the DataFrame to only show movies with a 'duration' of at least 200 minutes
movies[movies.duration >= 200]
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
78 | 8.4 | Once Upon a Time in America | R | Crime | 229 | [u'Robert De Niro', u'James Woods', u'Elizabet... |
85 | 8.4 | Lawrence of Arabia | PG | Adventure | 216 | [u"Peter O'Toole", u'Alec Guinness', u'Anthony... |
142 | 8.3 | Lagaan: Once Upon a Time in India | PG | Adventure | 224 | [u'Aamir Khan', u'Gracy Singh', u'Rachel Shell... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
204 | 8.1 | Ben-Hur | G | Adventure | 212 | [u'Charlton Heston', u'Jack Hawkins', u'Stephe... |
445 | 7.9 | The Ten Commandments | APPROVED | Adventure | 220 | [u'Charlton Heston', u'Yul Brynner', u'Anne Ba... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
630 | 7.7 | Malcolm X | PG-13 | Biography | 202 | [u'Denzel Washington', u'Angela Bassett', u'De... |
767 | 7.6 | It's a Mad, Mad, Mad, Mad World | APPROVED | Action | 205 | [u'Spencer Tracy', u'Milton Berle', u'Ethel Me... |
Understanding logical operators:
- and: True only if both sides of the operator are True
- or: True if either side of the operator is True
# demonstration of the 'and' operator
print(True and True)
print(True and False)
print(False and False)
True
False
False
# demonstration of the 'or' operator
print(True or True)
print(True or False)
print(False or False)
True
True
False
Rules for specifying multiple filter criteria in pandas:
- use & instead of and
- use | instead of or
Goal: Further filter the DataFrame of long movies (duration >= 200) to only show movies which also have a 'genre' of 'Drama'
# CORRECT: use the '&' operator to specify that both conditions are required
movies[(movies.duration >= 200) & (movies.genre == 'Drama')]
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
# INCORRECT: using the '|' operator would have shown movies that are either long or dramas (or both)
movies[(movies.duration >= 200) | (movies.genre == 'Drama')].head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
Goal: Filter the original DataFrame to show movies with a 'genre' of 'Crime' or 'Drama' or 'Action'
# use the '|' operator to specify that a row can match any of the three criteria
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].head(10)
# or equivalently, use the 'isin' method
movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
11 | 8.8 | Inception | PG-13 | Action | 148 | [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... |
12 | 8.8 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 124 | [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
Documentation for isin
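To select every genre except those three, the boolean Series can be inverted with the '~' operator; a minimal sketch:
# '~' inverts a boolean Series
movies[~movies.genre.isin(['Crime', 'Drama', 'Action'])].head()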
Question: When reading from a file, how do I read in only a subset of the columns?
# read a dataset of UFO reports into a DataFrame, and check the columns
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.columns
Index([u'City', u'Colors Reported', u'Shape Reported', u'State', u'Time'], dtype='object')
# specify which columns to include by name
ufo = pd.read_csv('http://bit.ly/uforeports', usecols=['City', 'State'])
# or equivalently, specify columns by position
ufo = pd.read_csv('http://bit.ly/uforeports', usecols=[0, 4])
ufo.columns
Index([u'City', u'Time'], dtype='object')
Question: When reading from a file, how do I read in only a subset of the rows?
# specify how many rows to read
ufo = pd.read_csv('http://bit.ly/uforeports', nrows=3)
ufo
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
Documentation for read_csv
Question: How do I iterate through a Series?
# Series are directly iterable (like a list)
for c in ufo.City:
print(c)
Ithaca
Willingboro
Holyoke
Question: How do I iterate through a DataFrame?
# various methods are available to iterate through a DataFrame
for index, row in ufo.iterrows():
print(index, row.City, row.State)
(0, 'Ithaca', 'NY')
(1, 'Willingboro', 'NJ')
(2, 'Holyoke', 'CO')
Documentation for iterrows
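As an alternative sketch, itertuples is generally faster than iterrows because it yields namedtuples instead of Series:
# 'itertuples' yields one namedtuple per row; the index is available as 'Index'
for row in ufo.itertuples():
    print(row.Index, row.City, row.State)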
Question: How do I drop all non-numeric columns from a DataFrame?
# read a dataset of alcohol consumption into a DataFrame, and check the data types
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.dtypes
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
# only include numeric columns in the DataFrame
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object
Documentation for select_dtypes
Question: How do I know whether I should pass an argument as a string or a list?
# describe all of the numeric columns
drinks.describe()
 | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
---|---|---|---|---|
count | 193.000000 | 193.000000 | 193.000000 | 193.000000 |
mean | 106.160622 | 80.994819 | 49.450777 | 4.717098 |
std | 101.143103 | 88.284312 | 79.697598 | 3.773298 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 20.000000 | 4.000000 | 1.000000 | 1.300000 |
50% | 76.000000 | 56.000000 | 8.000000 | 4.200000 |
75% | 188.000000 | 128.000000 | 59.000000 | 7.200000 |
max | 376.000000 | 438.000000 | 370.000000 | 14.400000 |
# pass the string 'all' to describe all columns
drinks.describe(include='all')
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
count | 193 | 193.000000 | 193.000000 | 193.000000 | 193.000000 | 193 |
unique | 193 | NaN | NaN | NaN | NaN | 6 |
top | Lesotho | NaN | NaN | NaN | NaN | Africa |
freq | 1 | NaN | NaN | NaN | NaN | 53 |
mean | NaN | 106.160622 | 80.994819 | 49.450777 | 4.717098 | NaN |
std | NaN | 101.143103 | 88.284312 | 79.697598 | 3.773298 | NaN |
min | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
25% | NaN | 20.000000 | 4.000000 | 1.000000 | 1.300000 | NaN |
50% | NaN | 76.000000 | 56.000000 | 8.000000 | 4.200000 | NaN |
75% | NaN | 188.000000 | 128.000000 | 59.000000 | 7.200000 | NaN |
max | NaN | 376.000000 | 438.000000 | 370.000000 | 14.400000 | NaN |
# pass a list of data types to only describe certain types
drinks.describe(include=['object', 'float64'])
 | country | total_litres_of_pure_alcohol | continent |
---|---|---|---|
count | 193 | 193.000000 | 193 |
unique | 193 | NaN | 6 |
top | Lesotho | NaN | Africa |
freq | 1 | NaN | 53 |
mean | NaN | 4.717098 | NaN |
std | NaN | 3.773298 | NaN |
min | NaN | 0.000000 | NaN |
25% | NaN | 1.300000 | NaN |
50% | NaN | 4.200000 | NaN |
75% | NaN | 7.200000 | NaN |
max | NaN | 14.400000 | NaN |
# pass a list even if you only want to describe a single data type
drinks.describe(include=['object'])
 | country | continent |
---|---|---|
count | 193 | 193 |
unique | 193 | 6 |
top | Lesotho | Africa |
freq | 1 | 53 |
Documentation for describe
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# drop a column (temporarily)
drinks.drop('continent', axis=1).head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 |
1 | Albania | 89 | 132 | 54 | 4.9 |
2 | Algeria | 25 | 0 | 14 | 0.7 |
3 | Andorra | 245 | 138 | 312 | 12.4 |
4 | Angola | 217 | 57 | 45 | 5.9 |
Documentation for drop
# drop a row (temporarily)
drinks.drop(2, axis=0).head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
5 | Antigua & Barbuda | 102 | 128 | 45 | 4.9 | North America |
When referring to rows or columns with the axis parameter: axis 0 refers to rows, and axis 1 refers to columns.
# calculate the mean of each numeric column
drinks.mean()
# or equivalently, specify the axis explicitly
drinks.mean(axis=0)
beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64
Documentation for mean
# calculate the mean of each row
drinks.mean(axis=1).head()
0      0.000
1     69.975
2      9.925
3    176.850
4     81.225
dtype: float64
When performing a mathematical operation with the axis parameter, the operation moves along that axis: axis=0 aggregates down the rows (one result per column), while axis=1 aggregates across the columns (one result per row).
# 'index' is an alias for axis 0
drinks.mean(axis='index')
beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64
# 'columns' is an alias for axis 1
drinks.mean(axis='columns').head()
0      0.000
1     69.975
2      9.925
3    176.850
4     81.225
dtype: float64
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
orders.head()
 | order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# normal way to access string methods in Python
'hello'.upper()
'HELLO'
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0             CHIPS AND FRESH TOMATO SALSA
1                                     IZZE
2                         NANTUCKET NECTAR
3    CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                             CHICKEN BOWL
Name: item_name, dtype: object
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
0    False
1    False
2    False
3    False
4     True
Name: item_name, dtype: bool
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains('Chicken')].head()
 | order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|---|
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
5 | 3 | 1 | Chicken Bowl | [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... | $10.98 |
11 | 6 | 1 | Chicken Crispy Tacos | [Roasted Chili Corn Salsa, [Fajita Vegetables,... | $8.75 |
12 | 6 | 1 | Chicken Soft Tacos | [Roasted Chili Corn Salsa, [Rice, Black Beans,... | $8.75 |
13 | 7 | 1 | Chicken Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... | $11.25 |
# string methods can be chained together
orders.choice_description.str.replace('[', '').str.replace(']', '').head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
# many pandas string methods support regular expressions (regex)
orders.choice_description.str.replace('[\[\]]', '').head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
String handling section of the pandas API reference
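Note that newer pandas versions (0.23 and later) accept an explicit regex parameter for str.replace, and recent releases default it to False; a sketch of the regex call written for those versions:
# explicitly request regex matching (required in recent pandas versions)
orders.choice_description.str.replace(r'[\[\]]', '', regex=True).head()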
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# examine the data type of each Series
drinks.dtypes
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
# change the data type of an existing Series
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes
country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
Documentation for astype
# alternatively, change the data type of a Series while reading in a file
drinks = pd.read_csv('http://bit.ly/drinksbycountry', dtype={'beer_servings':float})
drinks.dtypes
country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('http://bit.ly/chiporders')
orders.head()
 | order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# examine the data type of each Series
orders.dtypes
order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object
# convert a string to a number in order to do math
orders.item_price.str.replace('$', '').astype(float).mean()
7.464335785374397
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
0    False
1    False
2    False
3    False
4     True
Name: item_name, dtype: bool
# convert a boolean Series to an integer (False = 0, True = 1)
orders.item_name.str.contains('Chicken').astype(int).head()
0    0
1    0
2    0
3    0
4    1
Name: item_name, dtype: int32
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# calculate the mean beer servings across the entire dataset
drinks.beer_servings.mean()
106.16062176165804
# calculate the mean beer servings just for countries in Africa
drinks[drinks.continent=='Africa'].beer_servings.mean()
61.471698113207545
# calculate the mean beer servings for each continent
drinks.groupby('continent').beer_servings.mean()
continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64
Documentation for groupby
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby('continent').beer_servings.max()
continent
Africa           376
Asia             247
Europe           361
North America    285
Oceania          306
South America    333
Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])
continent | count | mean | min | max |
---|---|---|---|---|
Africa | 53 | 61.471698 | 0 | 376 |
Asia | 44 | 37.045455 | 0 | 247 |
Europe | 45 | 193.777778 | 0 | 361 |
North America | 23 | 145.434783 | 1 | 285 |
Oceania | 16 | 89.687500 | 0 | 306 |
South America | 12 | 175.083333 | 93 | 333 |
Documentation for agg
# specifying a column to which the aggregation function should be applied is not required
drinks.groupby('continent').mean()
continent | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
---|---|---|---|---|
Africa | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
Asia | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
Europe | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
North America | 145.434783 | 165.739130 | 24.521739 | 5.995652 |
Oceania | 89.687500 | 58.437500 | 35.625000 | 3.381250 |
South America | 175.083333 | 114.750000 | 62.416667 | 6.308333 |
# allow plots to appear in the notebook
%matplotlib inline
# side-by-side bar plot of the DataFrame directly above
drinks.groupby('continent').mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x9a52cf8>
Documentation for plot
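A stacked variant of the same plot is one parameter away; a minimal sketch:
# stacked bar plot of the grouped means
drinks.groupby('continent').mean().plot(kind='bar', stacked=True)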
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
 | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# examine the data type of each Series
movies.dtypes
star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object
Exploring a non-numeric Series:
# count the non-null values, unique values, and frequency of the most common value
movies.genre.describe()
count       979
unique       16
top       Drama
freq        278
Name: genre, dtype: object
Documentation for describe
# count how many times each value in the Series occurs
movies.genre.value_counts()
Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Thriller       5
Sci-Fi         5
Film-Noir      3
Family         2
Fantasy        1
History        1
Name: genre, dtype: int64
Documentation for value_counts
# display percentages instead of raw counts
movies.genre.value_counts(normalize=True)
Drama        0.283963
Comedy       0.159346
Action       0.138917
Crime        0.126660
Biography    0.078652
Adventure    0.076609
Animation    0.063330
Horror       0.029622
Mystery      0.016343
Western      0.009193
Thriller     0.005107
Sci-Fi       0.005107
Film-Noir    0.003064
Family       0.002043
Fantasy      0.001021
History      0.001021
Name: genre, dtype: float64
# 'value_counts' (like many pandas methods) outputs a Series
type(movies.genre.value_counts())
pandas.core.series.Series
# thus, you can add another Series method on the end
movies.genre.value_counts().head()
Drama 278 Comedy 156 Action 136 Crime 124 Biography 77 Name: genre, dtype: int64
# display the unique values in the Series
movies.genre.unique()
array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography', 'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi', 'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)
# count the number of unique values in the Series
movies.genre.nunique()
16
# compute a cross-tabulation of two Series
pd.crosstab(movies.genre, movies.content_rating)
genre \ content_rating | APPROVED | G | GP | NC-17 | NOT RATED | PASSED | PG | PG-13 | R | TV-MA | UNRATED | X |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Action | 3 | 1 | 1 | 0 | 4 | 1 | 11 | 44 | 67 | 0 | 3 | 0 |
Adventure | 3 | 2 | 0 | 0 | 5 | 1 | 21 | 23 | 17 | 0 | 2 | 0 |
Animation | 3 | 20 | 0 | 0 | 3 | 0 | 25 | 5 | 5 | 0 | 1 | 0 |
Biography | 1 | 2 | 1 | 0 | 1 | 0 | 6 | 29 | 36 | 0 | 0 | 0 |
Comedy | 9 | 2 | 1 | 1 | 16 | 3 | 23 | 23 | 73 | 0 | 4 | 1 |
Crime | 6 | 0 | 0 | 1 | 7 | 1 | 6 | 4 | 87 | 0 | 11 | 1 |
Drama | 12 | 3 | 0 | 4 | 24 | 1 | 25 | 55 | 143 | 1 | 9 | 1 |
Family | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
Fantasy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Film-Noir | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
History | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
Horror | 2 | 0 | 0 | 1 | 1 | 0 | 1 | 2 | 16 | 0 | 5 | 1 |
Mystery | 4 | 1 | 0 | 0 | 1 | 0 | 1 | 2 | 6 | 0 | 1 | 0 |
Sci-Fi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 |
Thriller | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 0 | 0 |
Western | 1 | 0 | 0 | 0 | 2 | 0 | 2 | 1 | 3 | 0 | 0 | 0 |
Documentation for crosstab
Exploring a numeric Series:
# calculate various summary statistics
movies.duration.describe()
count    979.000000
mean     120.979571
std       26.218010
min       64.000000
25%      102.000000
50%      117.000000
75%      134.000000
max      242.000000
Name: duration, dtype: float64
# many statistics are implemented as Series methods
movies.duration.mean()
120.97957099080695
Documentation for mean
# 'value_counts' is primarily useful for categorical data, not numerical data
movies.duration.value_counts().head()
112    23
113    22
102    20
101    20
129    19
Name: duration, dtype: int64
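For numerical data, one option is to bin the values before counting; a sketch using the optional bins parameter:
# group the durations into 4 equal-width bins, then count each bin
movies.duration.value_counts(bins=4)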
# allow plots to appear in the notebook
%matplotlib inline
# histogram of the 'duration' Series (shows the distribution of a numerical variable)
movies.duration.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x9e95710>
# bar plot of the 'value_counts' for the 'genre' Series
movies.genre.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xa234208>
Documentation for plot
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.tail()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
What does "NaN" mean?
numpy.nan
.read_csv
detects missing values (by default) when reading the file, and replaces them with this special value.Documentation for read_csv
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing)
ufo.isnull().tail()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | True | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
# 'notnull' returns the opposite of 'isnull' (True if not missing, False if missing)
ufo.notnull().tail()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | False | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
# count the number of missing values in each Series
ufo.isnull().sum()
City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64
This calculation works because:
- The sum method for a DataFrame operates on axis=0 by default (and thus produces column sums).
- In order to add the boolean values, pandas converts True to 1 and False to 0.
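A quick sketch of that boolean-to-integer conversion:
# True is treated as 1 and False as 0 when summed, so this returns 2
pd.Series([True, False, True]).sum()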
# use the 'isnull' Series method to filter the DataFrame rows
ufo[ufo.City.isnull()].head()
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
How to handle missing values depends on the dataset as well as the nature of your analysis. Here are some options:
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how='any').shape
(2486, 5)
Documentation for dropna
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how='all').shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
(15576, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
(18237, 5)
# 'value_counts' does not include missing values by default
ufo['Shape Reported'].value_counts().head()
LIGHT       2803
DISK        2122
TRIANGLE    1889
OTHER       1402
CIRCLE      1365
Name: Shape Reported, dtype: int64
# explicitly include missing values
ufo['Shape Reported'].value_counts(dropna=False).head()
LIGHT       2803
NaN         2644
DISK        2122
TRIANGLE    1889
OTHER       1402
Name: Shape Reported, dtype: int64
Documentation for value_counts
# fill in missing values with a specified value
ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)
Documentation for fillna
# confirm that the missing values were filled in
ufo['Shape Reported'].value_counts().head()
VARIOUS     2977
LIGHT       2803
DISK        2122
TRIANGLE    1889
OTHER       1402
Name: Shape Reported, dtype: int64
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
RangeIndex(start=0, stop=193, step=1)
# column names are also stored in a special "index" object
drinks.columns
Index([u'country', u'beer_servings', u'spirit_servings', u'wine_servings', u'total_litres_of_pure_alcohol', u'continent'], dtype='object')
# neither the index nor the columns are included in the shape
drinks.shape
(193, 6)
# index and columns both default to integers if you don't define them
pd.read_table('http://bit.ly/movieusers', header=None, sep='|').head()
 | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
What is the index used for?
# identification: index remains with each row when filtering the DataFrame
drinks[drinks.continent=='South America']
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
6 | Argentina | 193 | 25 | 221 | 8.3 | South America |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
23 | Brazil | 245 | 145 | 16 | 7.2 | South America |
35 | Chile | 130 | 124 | 172 | 7.6 | South America |
37 | Colombia | 159 | 76 | 3 | 4.2 | South America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
72 | Guyana | 93 | 302 | 1 | 7.1 | South America |
132 | Paraguay | 213 | 117 | 74 | 7.3 | South America |
133 | Peru | 163 | 160 | 21 | 6.1 | South America |
163 | Suriname | 128 | 178 | 7 | 5.6 | South America |
185 | Uruguay | 115 | 35 | 220 | 6.6 | South America |
188 | Venezuela | 333 | 100 | 3 | 7.7 | South America |
# selection: select a portion of the DataFrame using the index
drinks.loc[23, 'beer_servings']
245
Documentation for loc
# set an existing column as the index
drinks.set_index('country', inplace=True)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
Documentation for set_index
# 'country' is now the index
drinks.index
Index([u'Afghanistan', u'Albania', u'Algeria', u'Andorra', u'Angola', u'Antigua & Barbuda', u'Argentina', u'Armenia', u'Australia', u'Austria', ... u'Tanzania', u'USA', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Yemen', u'Zambia', u'Zimbabwe'], dtype='object', name=u'country', length=193)
# 'country' is no longer a column
drinks.columns
Index([u'beer_servings', u'spirit_servings', u'wine_servings', u'total_litres_of_pure_alcohol', u'continent'], dtype='object')
# 'country' data is no longer part of the DataFrame contents
drinks.shape
(193, 5)
# country name can now be used for selection
drinks.loc['Brazil', 'beer_servings']
245
# index name is optional
drinks.index.name = None
drinks.head()
 | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
# restore the index name, and move the index back to a column
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Documentation for reset_index
# many DataFrame methods output a DataFrame
drinks.describe()
 | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
---|---|---|---|---|
count | 193.000000 | 193.000000 | 193.000000 | 193.000000 |
mean | 106.160622 | 80.994819 | 49.450777 | 4.717098 |
std | 101.143103 | 88.284312 | 79.697598 | 3.773298 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 20.000000 | 4.000000 | 1.000000 | 1.300000 |
50% | 76.000000 | 56.000000 | 8.000000 | 4.200000 |
75% | 188.000000 | 128.000000 | 59.000000 | 7.200000 |
max | 376.000000 | 438.000000 | 370.000000 | 14.400000 |
# you can interact with any DataFrame using its index and columns
drinks.describe().loc['25%', 'beer_servings']
20.0
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index
drinks.index
RangeIndex(start=0, stop=193, step=1)
# every Series also has an index (which carries over from the DataFrame)
drinks.continent.head()
0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object
# set 'country' as the index
drinks.set_index('country', inplace=True)
Documentation for set_index
# Series index is on the left, values are on the right
drinks.continent.head()
country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: object
# another example of a Series (output from the 'value_counts' method)
drinks.continent.value_counts()
Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64
Documentation for value_counts
# access the Series index
drinks.continent.value_counts().index
Index([u'Africa', u'Europe', u'Asia', u'North America', u'Oceania', u'South America'], dtype='object')
# access the Series values
drinks.continent.value_counts().values
array([53, 45, 44, 23, 16, 12], dtype=int64)
# elements in a Series can be selected by index (using bracket notation)
drinks.continent.value_counts()['Africa']
53
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
Name: continent, dtype: int64
# any Series can also be sorted by its index
drinks.continent.value_counts().sort_index()
Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64
Documentation for sort_values and sort_index
What is the index used for?
# 'beer_servings' Series contains the average annual beer servings per person
drinks.beer_servings.head()
country
Afghanistan      0
Albania         89
Algeria         25
Andorra        245
Angola         217
Name: beer_servings, dtype: int64
# create a Series containing the population of two countries
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name='population')
people
Albania    3000000
Andorra      85000
Name: population, dtype: int64
Documentation for Series
# calculate the total annual beer servings for each country
(drinks.beer_servings * people).head()
Afghanistan            NaN
Albania        267000000.0
Algeria                NaN
Andorra         20825000.0
Angola                 NaN
dtype: float64
# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)
pd.concat([drinks, people], axis=1).head()
 | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | population |
---|---|---|---|---|---|---|
Afghanistan | 0 | 0 | 0 | 0.0 | Asia | NaN |
Albania | 89 | 132 | 54 | 4.9 | Europe | 3000000.0 |
Algeria | 25 | 0 | 14 | 0.7 | Africa | NaN |
Andorra | 245 | 138 | 312 | 12.4 | Europe | 85000.0 |
Angola | 217 | 57 | 45 | 5.9 | Africa | NaN |
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head(3)
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
The loc method is used to select rows and columns by label. You can pass it:
- a single label
- a list of labels
- a slice of labels (inclusive of both endpoints)
- a boolean Series
- a colon (which means "all labels")
# row 0, all columns
ufo.loc[0, :]
City                       Ithaca
Colors Reported               NaN
Shape Reported           TRIANGLE
State                          NY
Time               6/1/1930 22:00
Name: 0, dtype: object
# rows 0 and 1 and 2, all columns
ufo.loc[[0, 1, 2], :]
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), all columns
ufo.loc[0:2, :]
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# this implies "all columns", but explicitly stating "all columns" is better
ufo.loc[0:2]
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), column 'City'
ufo.loc[0:2, 'City']
0         Ithaca
1    Willingboro
2        Holyoke
Name: City, dtype: object
# rows 0 through 2 (inclusive), columns 'City' and 'State'
ufo.loc[0:2, ['City', 'State']]
 | City | State |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
# accomplish the same thing using double brackets - but using 'loc' is preferred since it's more explicit
ufo[['City', 'State']].head(3)
 | City | State |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
ufo.loc[0:2, 'City':'State']
 | City | Colors Reported | Shape Reported | State |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
# accomplish the same thing using 'head' and 'drop'
ufo.head(3).drop('Time', axis=1)
 | City | Colors Reported | Shape Reported | State |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
# rows in which the 'City' is 'Oakland', column 'State'
ufo.loc[ufo.City=='Oakland', 'State']
1694     CA
2144     CA
4686     MD
7293     CA
8488     CA
8768     CA
10816    OR
10948    CA
11045    CA
12322    CA
12941    CA
16803    MD
17322    CA
Name: State, dtype: object
# accomplish the same thing using "chained indexing" - but using 'loc' is preferred since chained indexing can cause problems
ufo[ufo.City=='Oakland'].State
1694     CA
2144     CA
4686     MD
7293     CA
8488     CA
8768     CA
10816    OR
10948    CA
11045    CA
12322    CA
12941    CA
16803    MD
17322    CA
Name: State, dtype: object
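The chained-indexing problems mainly appear during assignment; a minimal sketch of why loc is safer:
# chained indexing may assign to a temporary copy, so this can silently
# fail to modify 'ufo' (pandas raises a SettingWithCopyWarning)
ufo[ufo.City=='Oakland'].State = 'CA'
# assignment through 'loc' reliably modifies the original DataFrame
ufo.loc[ufo.City=='Oakland', 'State'] = 'CA'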
The iloc method is used to select rows and columns by integer position. You can pass it:
- a single integer position
- a list of integer positions
- a slice of integer positions (exclusive of the ending position)
- a colon (which means "all positions")
# rows in positions 0 and 1, columns in positions 0 and 3
ufo.iloc[[0, 1], [0, 3]]
 | City | State |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
ufo.iloc[0:2, 0:4]
 | City | Colors Reported | Shape Reported | State |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
# rows in positions 0 through 2 (exclusive), all columns
ufo.iloc[0:2, :]
 | City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
# accomplish the same thing - but using 'iloc' is preferred since it's more explicit
ufo[0:2]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
The ix method is used to select rows and columns by label or integer position, and should only be used when you need to mix label-based and integer-based selection in the same call. (Note: ix was deprecated in pandas 0.20 and removed in pandas 1.0; an ix-free equivalent is sketched after the examples below.)
# read a dataset of alcohol consumption into a DataFrame and set 'country' as the index
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
drinks.head()
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
country | |||||
Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
Albania | 89 | 132 | 54 | 4.9 | Europe |
Algeria | 25 | 0 | 14 | 0.7 | Africa |
Andorra | 245 | 138 | 312 | 12.4 | Europe |
Angola | 217 | 57 | 45 | 5.9 | Africa |
# row with label 'Albania', column in position 0
drinks.ix['Albania', 0]
89
# row in position 1, column with label 'beer_servings'
drinks.ix[1, 'beer_servings']
89
Rules for using numbers with ix: if the axis labels are strings (like the 'country' index above), numbers are treated as integer positions, so slices are exclusive of the stop; if the axis labels are integers (like the default index of 'ufo'), numbers are treated as labels, so slices are inclusive of the stop.
# rows 'Albania' through 'Andorra' (inclusive), columns in positions 0 through 2 (exclusive)
drinks.ix['Albania':'Andorra', 0:2]
beer_servings | spirit_servings | |
---|---|---|
country | ||
Albania | 89 | 132 |
Algeria | 25 | 0 |
Andorra | 245 | 138 |
# rows 0 through 2 (inclusive), columns in positions 0 through 2 (exclusive)
ufo.ix[0:2, 0:2]
City | Colors Reported | |
---|---|---|
0 | Ithaca | NaN |
1 | Willingboro | NaN |
2 | Holyoke | NaN |
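If you're running a newer pandas release where ix no longer exists, the same mixed selections can be written with loc and iloc plus a position lookup. A minimal sketch, equivalent to the two ix calls on the 'drinks' DataFrame above:
# row with label 'Albania', column in position 0 (no ix needed)
drinks.loc['Albania', drinks.columns[0]]
# row in position 1, column with label 'beer_servings' (no ix needed)
drinks.iloc[1, drinks.columns.get_loc('beer_servings')]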
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
ufo.shape
(18241, 5)
# remove the 'City' column (doesn't affect the DataFrame since inplace=False)
ufo.drop('City', axis=1).head()
Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|
0 | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | NaN | DISK | KS | 6/1/1931 13:00 |
4 | NaN | LIGHT | NY | 4/18/1933 19:00 |
# confirm that the 'City' column was not actually removed
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove the 'City' column (does affect the DataFrame since inplace=True)
ufo.drop('City', axis=1, inplace=True)
# confirm that the 'City' column was actually removed
ufo.head()
Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|
0 | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | NaN | DISK | KS | 6/1/1931 13:00 |
4 | NaN | LIGHT | NY | 4/18/1933 19:00 |
# drop a row if any value is missing from that row (doesn't affect the DataFrame since inplace=False)
ufo.dropna(how='any').shape
(2490, 4)
# confirm that no rows were actually removed
ufo.shape
(18241, 4)
# use an assignment statement instead of the 'inplace' parameter
ufo = ufo.set_index('Time')
ufo.tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | NaN | TRIANGLE | IL |
12/31/2000 23:00 | NaN | DISK | IA |
12/31/2000 23:45 | NaN | NaN | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | NaN | OVAL | FL |
# fill missing values using "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='bfill').tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | RED | TRIANGLE | IL |
12/31/2000 23:00 | RED | DISK | IA |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | NaN | OVAL | FL |
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='ffill').tail()
Colors Reported | Shape Reported | State | |
---|---|---|---|
Time | |||
12/31/2000 23:00 | RED | TRIANGLE | IL |
12/31/2000 23:00 | RED | DISK | IA |
12/31/2000 23:45 | RED | DISK | WI |
12/31/2000 23:45 | RED | LIGHT | WI |
12/31/2000 23:59 | RED | OVAL | FL |
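In recent pandas versions (2.1 and later), fillna(method=...) is deprecated in favor of the dedicated bfill and ffill methods. A sketch of the equivalent calls:
# equivalent "backward fill" and "forward fill" using the dedicated methods
ufo.bfill().tail()
ufo.ffill().tail()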
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# exact memory usage is unknown because object columns hold references to strings stored elsewhere in memory
drinks.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 6 columns): country 193 non-null object beer_servings 193 non-null int64 spirit_servings 193 non-null int64 wine_servings 193 non-null int64 total_litres_of_pure_alcohol 193 non-null float64 continent 193 non-null object dtypes: float64(1), int64(3), object(2) memory usage: 9.1+ KB
# force pandas to calculate the true memory usage
drinks.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 6 columns): country 193 non-null object beer_servings 193 non-null int64 spirit_servings 193 non-null int64 wine_servings 193 non-null int64 total_litres_of_pure_alcohol 193 non-null float64 continent 193 non-null object dtypes: float64(1), int64(3), object(2) memory usage: 24.4 KB
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index 72 country 9500 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 9244 dtype: int64
Documentation for info and memory_usage
# use the 'category' data type (new in pandas 0.15) to store the 'continent' strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent category dtype: object
# 'continent' Series appears to be unchanged
drinks.continent.head()
0 Asia 1 Europe 2 Africa 3 Europe 4 Africa Name: continent, dtype: category Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks.continent.cat.codes.head()
0 1 1 2 2 0 3 2 4 0 dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Index 72 country 9500 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 488 dtype: int64
# repeat this process for the 'country' Series
drinks['country'] = drinks.country.astype('category')
drinks.memory_usage(deep=True)
Index 72 country 9886 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 488 dtype: int64
# memory usage increased because we created 193 categories
drinks.country.cat.categories
Index([u'Afghanistan', u'Albania', u'Algeria', u'Andorra', u'Angola', u'Antigua & Barbuda', u'Argentina', u'Armenia', u'Australia', u'Austria', ... u'United Arab Emirates', u'United Kingdom', u'Uruguay', u'Uzbekistan', u'Vanuatu', u'Venezuela', u'Vietnam', u'Yemen', u'Zambia', u'Zimbabwe'], dtype='object', length=193)
The category data type should only be used with a string Series that has a small number of possible values.
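One rough heuristic (an informal sketch, not an official rule) is to compare the number of unique values to the length of the Series; a low ratio suggests a good candidate for the category type:
# 6 unique continents across 193 rows, so 'continent' is a good candidate
drinks.continent.nunique() / float(len(drinks))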
# create a small DataFrame from a dictionary
df = pd.DataFrame({'ID':[100, 101, 102, 103], 'quality':['good', 'very good', 'good', 'excellent']})
df
ID | quality | |
---|---|---|
0 | 100 | good |
1 | 101 | very good |
2 | 102 | good |
3 | 103 | excellent |
# sort the DataFrame by the 'quality' Series (alphabetical order)
df.sort_values('quality')
ID | quality | |
---|---|---|
3 | 103 | excellent |
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
# define a logical ordering for the categories
df['quality'] = df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered=True)
df.quality
0 good 1 very good 2 good 3 excellent Name: quality, dtype: category Categories (3, object): [good < very good < excellent]
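Note that in pandas 0.21 and later, passing 'categories' and 'ordered' directly to astype raises an error; the replacement is to build a CategoricalDtype and cast to it. A sketch of the newer API:
# newer API: define an ordered CategoricalDtype, then cast the Series to it
from pandas.api.types import CategoricalDtype
quality_dtype = CategoricalDtype(categories=['good', 'very good', 'excellent'], ordered=True)
df['quality'] = df.quality.astype(quality_dtype)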
# sort the DataFrame by the 'quality' Series (logical order)
df.sort_values('quality')
ID | quality | |
---|---|---|
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
3 | 103 | excellent |
# comparison operators work with ordered categories
df.loc[df.quality > 'good', :]
ID | quality | |
---|---|---|
1 | 101 | very good |
3 | 103 | excellent |
# read the training dataset from Kaggle's Titanic competition into a DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Goal: Predict passenger survival aboard the Titanic based on passenger attributes
# create a feature matrix 'X' by selecting two DataFrame columns
feature_cols = ['Pclass', 'Parch']
X = train.loc[:, feature_cols]
X.shape
(891, 2)
# create a response vector 'y' by selecting a Series
y = train.Survived
y.shape
(891L,)
Note: There is no need to convert these pandas objects to NumPy arrays. scikit-learn will understand these objects as long as they are entirely numeric and have the proper shapes.
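A quick sanity check of both conditions before fitting (a minimal sketch using the objects created above):
# confirm the features are entirely numeric, and that X and y have one row per passenger
X.dtypes
X.shape[0] == y.shape[0]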
# fit a classification model to the training data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
Video series: Introduction to machine learning with scikit-learn
# read the testing dataset from Kaggle's Titanic competition into a DataFrame
test = pd.read_csv('http://bit.ly/kaggletest')
test.head()
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
# create a feature matrix from the testing data that matches the training data
X_new = test.loc[:, feature_cols]
X_new.shape
(418, 2)
# use the fitted model to make predictions for the testing set observations
new_pred_class = logreg.predict(X_new)
# create a DataFrame of passenger IDs and testing set predictions
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).head()
PassengerId | Survived | |
---|---|---|
0 | 892 | 0 |
1 | 893 | 0 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 0 |
Documentation for the DataFrame constructor
# ensure that PassengerID is the first column by setting it as the index
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').head()
Survived | |
---|---|
PassengerId | |
892 | 0 |
893 | 0 |
894 | 0 |
895 | 0 |
896 | 0 |
# write the DataFrame to a CSV file that can be submitted to Kaggle
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').to_csv('sub.csv')
Documentation for to_csv
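An equivalent one-step version (a sketch): keep 'PassengerId' as a regular column and tell to_csv not to write the default integer index.
# skip set_index and suppress the index when writing
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).to_csv('sub.csv', index=False)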
# save a DataFrame to disk ("pickle it")
train.to_pickle('train.pkl')
# read a pickled object from disk ("unpickle it")
pd.read_pickle('train.pkl').head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Documentation for to_pickle and read_pickle
Question: Could you explain how to read the pandas documentation?
Question: What is the difference between ufo.isnull() and pd.isnull(ufo)?
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# use 'isnull' as a top-level function
pd.isnull(ufo).head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
# equivalent: use 'isnull' as a DataFrame method
ufo.isnull().head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | False | True | False | False | False |
1 | False | True | False | False | False |
2 | False | True | False | False | False |
3 | False | True | False | False | False |
4 | False | True | False | False | False |
Documentation for isnull
Question: Why are DataFrame slices inclusive when using .loc, but exclusive when using .iloc?
# label-based slicing is inclusive of the start and stop
ufo.loc[0:4, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# position-based slicing is inclusive of the start and exclusive of the stop
ufo.iloc[0:4, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
# 'iloc' is simply following NumPy's slicing convention...
ufo.values[0:4, :]
array([['Ithaca', nan, 'TRIANGLE', 'NY', '6/1/1930 22:00'], ['Willingboro', nan, 'OTHER', 'NJ', '6/30/1930 20:00'], ['Holyoke', nan, 'OVAL', 'CO', '2/15/1931 14:00'], ['Abilene', nan, 'DISK', 'KS', '6/1/1931 13:00']], dtype=object)
# ...and NumPy is simply following Python's slicing convention
'python'[0:4]
'pyth'
# 'loc' is inclusive of the stopping label because you don't necessarily know what label will come after it
ufo.loc[0:4, 'City':'State']
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
3 | Abilene | NaN | DISK | KS |
4 | New York Worlds Fair | NaN | LIGHT | NY |
Question: How do I randomly sample rows from a DataFrame?
# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)
ufo.sample(n=3)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
12192 | Winston | GREEN | LIGHT | OR | 9/23/1998 21:00 |
1775 | Lake Wales | NaN | DISK | FL | 1/20/1969 19:00 |
3141 | Cannon AFB | NaN | DISK | NM | 1/6/1976 1:00 |
Documentation for sample
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
217 | Norridgewock | NaN | DISK | ME | 9/15/1952 14:00 |
12282 | Ipava | NaN | TRIANGLE | IL | 10/1/1998 21:15 |
17933 | Ellinwood | NaN | FIREBALL | KS | 11/13/2000 22:00 |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
Documentation for isin
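An equivalent way to build the testing DataFrame (a sketch that assumes the index values are unique, which holds for the default integer index):
# drop the training rows by index label, leaving the testing rows
test = ufo.drop(train.index)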
# read the training dataset from Kaggle's Titanic competition
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# create the 'Sex_male' dummy variable using the 'map' method
train['Sex_male'] = train.Sex.map({'female':0, 'male':1})
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
Documentation for map
# alternative: use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
female | male | |
---|---|---|
0 | 0.0 | 1.0 |
1 | 1.0 | 0.0 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 0.0 | 1.0 |
Generally speaking, if a categorical feature has K possible values, you only need K-1 dummy variables to capture all of its information: the value of the dropped dummy is implied whenever the remaining dummies are all 0.
# drop the first dummy variable ('female') using the 'iloc' method
pd.get_dummies(train.Sex).iloc[:, 1:].head()
male | |
---|---|
0 | 1.0 |
1 | 0.0 |
2 | 0.0 |
3 | 0.0 |
4 | 1.0 |
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()
Sex_male | |
---|---|
0 | 1.0 |
1 | 0.0 |
2 | 0.0 |
3 | 0.0 |
4 | 1.0 |
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix='Embarked').head(10)
Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|
0 | 0.0 | 0.0 | 1.0 |
1 | 1.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 1.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 0.0 | 0.0 | 1.0 |
5 | 0.0 | 1.0 | 0.0 |
6 | 0.0 | 0.0 | 1.0 |
7 | 0.0 | 0.0 | 1.0 |
8 | 0.0 | 0.0 | 1.0 |
9 | 1.0 | 0.0 | 0.0 |
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)
Embarked_Q | Embarked_S | |
---|---|---|
0 | 0.0 | 1.0 |
1 | 0.0 | 0.0 |
2 | 0.0 | 1.0 |
3 | 0.0 | 1.0 |
4 | 0.0 | 1.0 |
5 | 1.0 | 0.0 |
6 | 0.0 | 1.0 |
7 | 0.0 | 1.0 |
8 | 0.0 | 1.0 |
9 | 0.0 | 0.0 |
How to translate these values back to the original 'Embarked' value: Embarked_Q=1 means 'Q', Embarked_S=1 means 'S', and both equal to 0 means 'C' (a code sketch appears after the concat example below).
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 | 0.0 | 1.0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 0.0 | 0.0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 0.0 | 1.0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 0.0 | 1.0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 | 0.0 | 1.0 |
Documentation for concat
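To apply the translation rule stated above in code, here is a sketch using numpy.where on the dummy columns created above:
# recover the original 'Embarked' value from the two dummy columns
import numpy as np
decoded = np.where(embarked_dummies.Embarked_Q == 1, 'Q',
                   np.where(embarked_dummies.Embarked_S == 1, 'S', 'C'))
decoded[:5]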
# reset the DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=['Sex', 'Embarked']).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 1.0 | 0.0 | 1.0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 0.0 | 0.0 | 0.0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 0.0 | 0.0 | 1.0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 0.0 | 0.0 | 1.0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 1.0 | 0.0 | 1.0 |
Documentation for get_dummies
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# 'Time' is currently stored as a string
ufo.dtypes
City object Colors Reported object Shape Reported object State object Time object dtype: object
# hour could be accessed using string slicing, but this approach breaks too easily
ufo.Time.str.slice(-5, -3).astype(int).head()
0 22 1 20 2 14 3 13 4 19 Name: Time, dtype: int32
# convert 'Time' to datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 1930-06-01 22:00:00 |
1 | Willingboro | NaN | OTHER | NJ | 1930-06-30 20:00:00 |
2 | Holyoke | NaN | OVAL | CO | 1931-02-15 14:00:00 |
3 | Abilene | NaN | DISK | KS | 1931-06-01 13:00:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 1933-04-18 19:00:00 |
ufo.dtypes
City object Colors Reported object Shape Reported object State object Time datetime64[ns] dtype: object
Documentation for to_datetime
# convenient Series attributes are now available
ufo.Time.dt.hour.head()
0 22 1 20 2 14 3 13 4 19 Name: Time, dtype: int64
ufo.Time.dt.weekday_name.head()
0 Sunday 1 Monday 2 Sunday 3 Monday 4 Tuesday Name: Time, dtype: object
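Note: the weekday_name attribute was removed in pandas 1.0. In newer versions, use the day_name method instead (a sketch):
# newer API: day_name() is a method rather than an attribute
ufo.Time.dt.day_name().head()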
ufo.Time.dt.dayofyear.head()
0 152 1 181 2 46 3 152 4 108 Name: Time, dtype: int64
API reference for datetime properties and methods
# convert a single string to datetime format (outputs a timestamp object)
ts = pd.to_datetime('1/1/1999')
ts
Timestamp('1999-01-01 00:00:00')
# compare a datetime Series with a timestamp
ufo.loc[ufo.Time >= ts, :].head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
12832 | Loma Rica | NaN | LIGHT | CA | 1999-01-01 02:30:00 |
12833 | Bauxite | NaN | NaN | AR | 1999-01-01 03:00:00 |
12834 | Florence | NaN | CYLINDER | SC | 1999-01-01 14:00:00 |
12835 | Lake Henshaw | NaN | CIGAR | CA | 1999-01-01 15:00:00 |
12836 | Wilmington Island | NaN | LIGHT | GA | 1999-01-01 17:15:00 |
# perform mathematical operations with timestamps (outputs a timedelta object)
ufo.Time.max() - ufo.Time.min()
Timedelta('25781 days 01:59:00')
# timedelta objects also have attributes you can access
(ufo.Time.max() - ufo.Time.min()).days
25781L
# allow plots to appear in the notebook
%matplotlib inline
# count the number of UFO reports per year
ufo['Year'] = ufo.Time.dt.year
ufo.Year.value_counts().sort_index().head()
1930 2 1931 2 1933 1 1934 1 1935 1 Name: Year, dtype: int64
# plot the number of UFO reports per year (line plot is the default)
ufo.Year.value_counts().sort_index().plot()
<matplotlib.axes._subplots.AxesSubplot at 0xcc6b240>
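Since plot returns a matplotlib Axes object, the chart can be labeled directly. A minimal sketch:
# add axis labels to the same line plot
ax = ufo.Year.value_counts().sort_index().plot()
ax.set_xlabel('Year')
ax.set_ylabel('Number of UFO reports')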
# read a dataset of movie reviewers into a DataFrame
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols, index_col='user_id')
users.head()
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
1 | 24 | M | technician | 85711 |
2 | 53 | F | other | 94043 |
3 | 23 | M | writer | 32067 |
4 | 24 | M | technician | 43537 |
5 | 33 | F | other | 15213 |
users.shape
(943, 4)
# detect duplicate zip codes: True if an item is identical to a previous item
users.zip_code.duplicated().tail()
user_id 939 False 940 True 941 False 942 False 943 False Name: zip_code, dtype: bool
# count the duplicate items (True becomes 1, False becomes 0)
users.zip_code.duplicated().sum()
148
# detect duplicate DataFrame rows: True if an entire row is identical to a previous row
users.duplicated().tail()
user_id 939 False 940 False 941 False 942 False 943 False dtype: bool
# count the duplicate rows
users.duplicated().sum()
7
Logic for duplicated:
- keep='first' (default): Mark duplicates as True except for the first occurrence.
- keep='last': Mark duplicates as True except for the last occurrence.
- keep=False: Mark all duplicates as True.
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep='first'), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep='last'), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep='first').shape
(936, 4)
users.drop_duplicates(keep='last').shape
(936, 4)
users.drop_duplicates(keep=False).shape
(929, 4)
Documentation for drop_duplicates
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=['age', 'zip_code']).sum()
16
users.drop_duplicates(subset=['age', 'zip_code']).shape
(927, 4)
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# count the missing values in the 'content_rating' Series
movies.content_rating.isnull().sum()
3
# examine the DataFrame rows that contain those missing values
movies[movies.content_rating.isnull()]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
187 | 8.2 | Butch Cassidy and the Sundance Kid | NaN | Biography | 110 | [u'Paul Newman', u'Robert Redford', u'Katharin... |
649 | 7.7 | Where Eagles Dare | NaN | Action | 158 | [u'Richard Burton', u'Clint Eastwood', u'Mary ... |
936 | 7.4 | True Grit | NaN | Adventure | 128 | [u'John Wayne', u'Kim Darby', u'Glen Campbell'] |
# examine the unique values in the 'content_rating' Series
movies.content_rating.value_counts()
R 460 PG-13 189 PG 123 NOT RATED 65 APPROVED 47 UNRATED 38 G 32 PASSED 7 NC-17 7 X 4 GP 3 TV-MA 1 Name: content_rating, dtype: int64
Goal: Mark the 'NOT RATED' values as missing values, represented by 'NaN'.
# first, locate the relevant rows
movies[movies.content_rating=='NOT RATED'].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
6 | 8.9 | The Good, the Bad and the Ugly | NOT RATED | Western | 161 | [u'Clint Eastwood', u'Eli Wallach', u'Lee Van ... |
41 | 8.5 | Sunset Blvd. | NOT RATED | Drama | 110 | [u'William Holden', u'Gloria Swanson', u'Erich... |
63 | 8.4 | M | NOT RATED | Crime | 99 | [u'Peter Lorre', u'Ellen Widmann', u'Inge Land... |
66 | 8.4 | Munna Bhai M.B.B.S. | NOT RATED | Comedy | 156 | [u'Sunil Dutt', u'Sanjay Dutt', u'Arshad Warsi'] |
# then, select the 'content_rating' Series from those rows
movies[movies.content_rating=='NOT RATED'].content_rating.head()
5 NOT RATED 6 NOT RATED 41 NOT RATED 63 NOT RATED 66 NOT RATED Name: content_rating, dtype: object
# finally, replace the 'NOT RATED' values with 'NaN' (imported from NumPy)
import numpy as np
movies[movies.content_rating=='NOT RATED'].content_rating = np.nan
c:\Users\Kevin\Anaconda\lib\site-packages\pandas\core\generic.py:2701: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self[name] = value
Problem: That statement involves two operations, a __getitem__ and a __setitem__. pandas can't guarantee whether the __getitem__ operation returns a view or a copy of the data.
- If __getitem__ returns a view of the data, __setitem__ will affect the 'movies' DataFrame.
- If __getitem__ returns a copy of the data, __setitem__ will not affect the 'movies' DataFrame.
# the 'content_rating' Series has not changed
movies.content_rating.isnull().sum()
3
Solution: Use the loc method, which replaces the 'NOT RATED' values in a single __setitem__ operation.
# replace the 'NOT RATED' values with 'NaN' (does not cause a SettingWithCopyWarning)
movies.loc[movies.content_rating=='NOT RATED', 'content_rating'] = np.nan
# this time, the 'content_rating' Series has changed
movies.content_rating.isnull().sum()
68
Summary: Use the loc method any time you are selecting rows and columns in the same statement.
More information: Modern Pandas (Part 1)
# create a DataFrame only containing movies with a high 'star_rating'
top_movies = movies.loc[movies.star_rating >= 9, :]
top_movies
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
Goal: Fix the 'duration' for 'The Shawshank Redemption'.
# overwrite the relevant cell with the correct duration
top_movies.loc[0, 'duration'] = 150
c:\Users\Kevin\Anaconda\lib\site-packages\pandas\core\indexing.py:465: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
Problem: pandas isn't sure whether 'top_movies' is a view or a copy of 'movies'.
# 'top_movies' DataFrame has been updated
top_movies
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 150 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
# 'movies' DataFrame has not been updated
movies.head(1)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
Solution: Any time you are attempting to create a DataFrame copy, use the copy method.
# explicitly create a copy of 'movies'
top_movies = movies.loc[movies.star_rating >= 9, :].copy()
# pandas now knows that you are updating a copy instead of a view (does not cause a SettingWithCopyWarning)
top_movies.loc[0, 'duration'] = 150
# 'top_movies' DataFrame has been updated
top_movies
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 150 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
Documentation on indexing and selection: Returning a view versus a copy
Stack Overflow: What is the point of views in pandas if it is undefined whether an indexing operation returns a view or a copy?
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
# only 60 rows will be displayed when printing
drinks
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
5 | Antigua & Barbuda | 102 | 128 | 45 | 4.9 | North America |
6 | Argentina | 193 | 25 | 221 | 8.3 | South America |
7 | Armenia | 21 | 179 | 11 | 3.8 | Europe |
8 | Australia | 261 | 72 | 212 | 10.4 | Oceania |
9 | Austria | 279 | 75 | 191 | 9.7 | Europe |
10 | Azerbaijan | 21 | 46 | 5 | 1.3 | Europe |
11 | Bahamas | 122 | 176 | 51 | 6.3 | North America |
12 | Bahrain | 42 | 63 | 7 | 2.0 | Asia |
13 | Bangladesh | 0 | 0 | 0 | 0.0 | Asia |
14 | Barbados | 143 | 173 | 36 | 6.3 | North America |
15 | Belarus | 142 | 373 | 42 | 14.4 | Europe |
16 | Belgium | 295 | 84 | 212 | 10.5 | Europe |
17 | Belize | 263 | 114 | 8 | 6.8 | North America |
18 | Benin | 34 | 4 | 13 | 1.1 | Africa |
19 | Bhutan | 23 | 0 | 0 | 0.4 | Asia |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
21 | Bosnia-Herzegovina | 76 | 173 | 8 | 4.6 | Europe |
22 | Botswana | 173 | 35 | 35 | 5.4 | Africa |
23 | Brazil | 245 | 145 | 16 | 7.2 | South America |
24 | Brunei | 31 | 2 | 1 | 0.6 | Asia |
25 | Bulgaria | 231 | 252 | 94 | 10.3 | Europe |
26 | Burkina Faso | 25 | 7 | 7 | 4.3 | Africa |
27 | Burundi | 88 | 0 | 0 | 6.3 | Africa |
28 | Cote d'Ivoire | 37 | 1 | 7 | 4.0 | Africa |
29 | Cabo Verde | 144 | 56 | 16 | 4.0 | Africa |
... | ... | ... | ... | ... | ... | ... |
163 | Suriname | 128 | 178 | 7 | 5.6 | South America |
164 | Swaziland | 90 | 2 | 2 | 4.7 | Africa |
165 | Sweden | 152 | 60 | 186 | 7.2 | Europe |
166 | Switzerland | 185 | 100 | 280 | 10.2 | Europe |
167 | Syria | 5 | 35 | 16 | 1.0 | Asia |
168 | Tajikistan | 2 | 15 | 0 | 0.3 | Asia |
169 | Thailand | 99 | 258 | 1 | 6.4 | Asia |
170 | Macedonia | 106 | 27 | 86 | 3.9 | Europe |
171 | Timor-Leste | 1 | 1 | 4 | 0.1 | Asia |
172 | Togo | 36 | 2 | 19 | 1.3 | Africa |
173 | Tonga | 36 | 21 | 5 | 1.1 | Oceania |
174 | Trinidad & Tobago | 197 | 156 | 7 | 6.4 | North America |
175 | Tunisia | 51 | 3 | 20 | 1.3 | Africa |
176 | Turkey | 51 | 22 | 7 | 1.4 | Asia |
177 | Turkmenistan | 19 | 71 | 32 | 2.2 | Asia |
178 | Tuvalu | 6 | 41 | 9 | 1.0 | Oceania |
179 | Uganda | 45 | 9 | 0 | 8.3 | Africa |
180 | Ukraine | 206 | 237 | 45 | 8.9 | Europe |
181 | United Arab Emirates | 16 | 135 | 5 | 2.8 | Asia |
182 | United Kingdom | 219 | 126 | 195 | 10.4 | Europe |
183 | Tanzania | 36 | 6 | 1 | 5.7 | Africa |
184 | USA | 249 | 158 | 84 | 8.7 | North America |
185 | Uruguay | 115 | 35 | 220 | 6.6 | South America |
186 | Uzbekistan | 25 | 101 | 8 | 2.4 | Asia |
187 | Vanuatu | 21 | 18 | 11 | 0.9 | Oceania |
188 | Venezuela | 333 | 100 | 3 | 7.7 | South America |
189 | Vietnam | 111 | 2 | 1 | 2.0 | Asia |
190 | Yemen | 6 | 0 | 0 | 0.1 | Asia |
191 | Zambia | 32 | 19 | 4 | 2.5 | Africa |
192 | Zimbabwe | 64 | 18 | 4 | 4.7 | Africa |
193 rows × 6 columns
# check the current setting for the 'max_rows' option
pd.get_option('display.max_rows')
60
Documentation for get_option
# overwrite the current setting so that all rows will be displayed
pd.set_option('display.max_rows', None)
drinks
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
5 | Antigua & Barbuda | 102 | 128 | 45 | 4.9 | North America |
6 | Argentina | 193 | 25 | 221 | 8.3 | South America |
7 | Armenia | 21 | 179 | 11 | 3.8 | Europe |
8 | Australia | 261 | 72 | 212 | 10.4 | Oceania |
9 | Austria | 279 | 75 | 191 | 9.7 | Europe |
10 | Azerbaijan | 21 | 46 | 5 | 1.3 | Europe |
11 | Bahamas | 122 | 176 | 51 | 6.3 | North America |
12 | Bahrain | 42 | 63 | 7 | 2.0 | Asia |
13 | Bangladesh | 0 | 0 | 0 | 0.0 | Asia |
14 | Barbados | 143 | 173 | 36 | 6.3 | North America |
15 | Belarus | 142 | 373 | 42 | 14.4 | Europe |
16 | Belgium | 295 | 84 | 212 | 10.5 | Europe |
17 | Belize | 263 | 114 | 8 | 6.8 | North America |
18 | Benin | 34 | 4 | 13 | 1.1 | Africa |
19 | Bhutan | 23 | 0 | 0 | 0.4 | Asia |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
21 | Bosnia-Herzegovina | 76 | 173 | 8 | 4.6 | Europe |
22 | Botswana | 173 | 35 | 35 | 5.4 | Africa |
23 | Brazil | 245 | 145 | 16 | 7.2 | South America |
24 | Brunei | 31 | 2 | 1 | 0.6 | Asia |
25 | Bulgaria | 231 | 252 | 94 | 10.3 | Europe |
26 | Burkina Faso | 25 | 7 | 7 | 4.3 | Africa |
27 | Burundi | 88 | 0 | 0 | 6.3 | Africa |
28 | Cote d'Ivoire | 37 | 1 | 7 | 4.0 | Africa |
29 | Cabo Verde | 144 | 56 | 16 | 4.0 | Africa |
30 | Cambodia | 57 | 65 | 1 | 2.2 | Asia |
31 | Cameroon | 147 | 1 | 4 | 5.8 | Africa |
32 | Canada | 240 | 122 | 100 | 8.2 | North America |
33 | Central African Republic | 17 | 2 | 1 | 1.8 | Africa |
34 | Chad | 15 | 1 | 1 | 0.4 | Africa |
35 | Chile | 130 | 124 | 172 | 7.6 | South America |
36 | China | 79 | 192 | 8 | 5.0 | Asia |
37 | Colombia | 159 | 76 | 3 | 4.2 | South America |
38 | Comoros | 1 | 3 | 1 | 0.1 | Africa |
39 | Congo | 76 | 1 | 9 | 1.7 | Africa |
40 | Cook Islands | 0 | 254 | 74 | 5.9 | Oceania |
41 | Costa Rica | 149 | 87 | 11 | 4.4 | North America |
42 | Croatia | 230 | 87 | 254 | 10.2 | Europe |
43 | Cuba | 93 | 137 | 5 | 4.2 | North America |
44 | Cyprus | 192 | 154 | 113 | 8.2 | Europe |
45 | Czech Republic | 361 | 170 | 134 | 11.8 | Europe |
46 | North Korea | 0 | 0 | 0 | 0.0 | Asia |
47 | DR Congo | 32 | 3 | 1 | 2.3 | Africa |
48 | Denmark | 224 | 81 | 278 | 10.4 | Europe |
49 | Djibouti | 15 | 44 | 3 | 1.1 | Africa |
50 | Dominica | 52 | 286 | 26 | 6.6 | North America |
51 | Dominican Republic | 193 | 147 | 9 | 6.2 | North America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
53 | Egypt | 6 | 4 | 1 | 0.2 | Africa |
54 | El Salvador | 52 | 69 | 2 | 2.2 | North America |
55 | Equatorial Guinea | 92 | 0 | 233 | 5.8 | Africa |
56 | Eritrea | 18 | 0 | 0 | 0.5 | Africa |
57 | Estonia | 224 | 194 | 59 | 9.5 | Europe |
58 | Ethiopia | 20 | 3 | 0 | 0.7 | Africa |
59 | Fiji | 77 | 35 | 1 | 2.0 | Oceania |
60 | Finland | 263 | 133 | 97 | 10.0 | Europe |
61 | France | 127 | 151 | 370 | 11.8 | Europe |
62 | Gabon | 347 | 98 | 59 | 8.9 | Africa |
63 | Gambia | 8 | 0 | 1 | 2.4 | Africa |
64 | Georgia | 52 | 100 | 149 | 5.4 | Europe |
65 | Germany | 346 | 117 | 175 | 11.3 | Europe |
66 | Ghana | 31 | 3 | 10 | 1.8 | Africa |
67 | Greece | 133 | 112 | 218 | 8.3 | Europe |
68 | Grenada | 199 | 438 | 28 | 11.9 | North America |
69 | Guatemala | 53 | 69 | 2 | 2.2 | North America |
70 | Guinea | 9 | 0 | 2 | 0.2 | Africa |
71 | Guinea-Bissau | 28 | 31 | 21 | 2.5 | Africa |
72 | Guyana | 93 | 302 | 1 | 7.1 | South America |
73 | Haiti | 1 | 326 | 1 | 5.9 | North America |
74 | Honduras | 69 | 98 | 2 | 3.0 | North America |
75 | Hungary | 234 | 215 | 185 | 11.3 | Europe |
76 | Iceland | 233 | 61 | 78 | 6.6 | Europe |
77 | India | 9 | 114 | 0 | 2.2 | Asia |
78 | Indonesia | 5 | 1 | 0 | 0.1 | Asia |
79 | Iran | 0 | 0 | 0 | 0.0 | Asia |
80 | Iraq | 9 | 3 | 0 | 0.2 | Asia |
81 | Ireland | 313 | 118 | 165 | 11.4 | Europe |
82 | Israel | 63 | 69 | 9 | 2.5 | Asia |
83 | Italy | 85 | 42 | 237 | 6.5 | Europe |
84 | Jamaica | 82 | 97 | 9 | 3.4 | North America |
85 | Japan | 77 | 202 | 16 | 7.0 | Asia |
86 | Jordan | 6 | 21 | 1 | 0.5 | Asia |
87 | Kazakhstan | 124 | 246 | 12 | 6.8 | Asia |
88 | Kenya | 58 | 22 | 2 | 1.8 | Africa |
89 | Kiribati | 21 | 34 | 1 | 1.0 | Oceania |
90 | Kuwait | 0 | 0 | 0 | 0.0 | Asia |
91 | Kyrgyzstan | 31 | 97 | 6 | 2.4 | Asia |
92 | Laos | 62 | 0 | 123 | 6.2 | Asia |
93 | Latvia | 281 | 216 | 62 | 10.5 | Europe |
94 | Lebanon | 20 | 55 | 31 | 1.9 | Asia |
95 | Lesotho | 82 | 29 | 0 | 2.8 | Africa |
96 | Liberia | 19 | 152 | 2 | 3.1 | Africa |
97 | Libya | 0 | 0 | 0 | 0.0 | Africa |
98 | Lithuania | 343 | 244 | 56 | 12.9 | Europe |
99 | Luxembourg | 236 | 133 | 271 | 11.4 | Europe |
100 | Madagascar | 26 | 15 | 4 | 0.8 | Africa |
101 | Malawi | 8 | 11 | 1 | 1.5 | Africa |
102 | Malaysia | 13 | 4 | 0 | 0.3 | Asia |
103 | Maldives | 0 | 0 | 0 | 0.0 | Asia |
104 | Mali | 5 | 1 | 1 | 0.6 | Africa |
105 | Malta | 149 | 100 | 120 | 6.6 | Europe |
106 | Marshall Islands | 0 | 0 | 0 | 0.0 | Oceania |
107 | Mauritania | 0 | 0 | 0 | 0.0 | Africa |
108 | Mauritius | 98 | 31 | 18 | 2.6 | Africa |
109 | Mexico | 238 | 68 | 5 | 5.5 | North America |
110 | Micronesia | 62 | 50 | 18 | 2.3 | Oceania |
111 | Monaco | 0 | 0 | 0 | 0.0 | Europe |
112 | Mongolia | 77 | 189 | 8 | 4.9 | Asia |
113 | Montenegro | 31 | 114 | 128 | 4.9 | Europe |
114 | Morocco | 12 | 6 | 10 | 0.5 | Africa |
115 | Mozambique | 47 | 18 | 5 | 1.3 | Africa |
116 | Myanmar | 5 | 1 | 0 | 0.1 | Asia |
117 | Namibia | 376 | 3 | 1 | 6.8 | Africa |
118 | Nauru | 49 | 0 | 8 | 1.0 | Oceania |
119 | Nepal | 5 | 6 | 0 | 0.2 | Asia |
120 | Netherlands | 251 | 88 | 190 | 9.4 | Europe |
121 | New Zealand | 203 | 79 | 175 | 9.3 | Oceania |
122 | Nicaragua | 78 | 118 | 1 | 3.5 | North America |
123 | Niger | 3 | 2 | 1 | 0.1 | Africa |
124 | Nigeria | 42 | 5 | 2 | 9.1 | Africa |
125 | Niue | 188 | 200 | 7 | 7.0 | Oceania |
126 | Norway | 169 | 71 | 129 | 6.7 | Europe |
127 | Oman | 22 | 16 | 1 | 0.7 | Asia |
128 | Pakistan | 0 | 0 | 0 | 0.0 | Asia |
129 | Palau | 306 | 63 | 23 | 6.9 | Oceania |
130 | Panama | 285 | 104 | 18 | 7.2 | North America |
131 | Papua New Guinea | 44 | 39 | 1 | 1.5 | Oceania |
132 | Paraguay | 213 | 117 | 74 | 7.3 | South America |
133 | Peru | 163 | 160 | 21 | 6.1 | South America |
134 | Philippines | 71 | 186 | 1 | 4.6 | Asia |
135 | Poland | 343 | 215 | 56 | 10.9 | Europe |
136 | Portugal | 194 | 67 | 339 | 11.0 | Europe |
137 | Qatar | 1 | 42 | 7 | 0.9 | Asia |
138 | South Korea | 140 | 16 | 9 | 9.8 | Asia |
139 | Moldova | 109 | 226 | 18 | 6.3 | Europe |
140 | Romania | 297 | 122 | 167 | 10.4 | Europe |
141 | Russian Federation | 247 | 326 | 73 | 11.5 | Asia |
142 | Rwanda | 43 | 2 | 0 | 6.8 | Africa |
143 | St. Kitts & Nevis | 194 | 205 | 32 | 7.7 | North America |
144 | St. Lucia | 171 | 315 | 71 | 10.1 | North America |
145 | St. Vincent & the Grenadines | 120 | 221 | 11 | 6.3 | North America |
146 | Samoa | 105 | 18 | 24 | 2.6 | Oceania |
147 | San Marino | 0 | 0 | 0 | 0.0 | Europe |
148 | Sao Tome & Principe | 56 | 38 | 140 | 4.2 | Africa |
149 | Saudi Arabia | 0 | 5 | 0 | 0.1 | Asia |
150 | Senegal | 9 | 1 | 7 | 0.3 | Africa |
151 | Serbia | 283 | 131 | 127 | 9.6 | Europe |
152 | Seychelles | 157 | 25 | 51 | 4.1 | Africa |
153 | Sierra Leone | 25 | 3 | 2 | 6.7 | Africa |
154 | Singapore | 60 | 12 | 11 | 1.5 | Asia |
155 | Slovakia | 196 | 293 | 116 | 11.4 | Europe |
156 | Slovenia | 270 | 51 | 276 | 10.6 | Europe |
157 | Solomon Islands | 56 | 11 | 1 | 1.2 | Oceania |
158 | Somalia | 0 | 0 | 0 | 0.0 | Africa |
159 | South Africa | 225 | 76 | 81 | 8.2 | Africa |
160 | Spain | 284 | 157 | 112 | 10.0 | Europe |
161 | Sri Lanka | 16 | 104 | 0 | 2.2 | Asia |
162 | Sudan | 8 | 13 | 0 | 1.7 | Africa |
163 | Suriname | 128 | 178 | 7 | 5.6 | South America |
164 | Swaziland | 90 | 2 | 2 | 4.7 | Africa |
165 | Sweden | 152 | 60 | 186 | 7.2 | Europe |
166 | Switzerland | 185 | 100 | 280 | 10.2 | Europe |
167 | Syria | 5 | 35 | 16 | 1.0 | Asia |
168 | Tajikistan | 2 | 15 | 0 | 0.3 | Asia |
169 | Thailand | 99 | 258 | 1 | 6.4 | Asia |
170 | Macedonia | 106 | 27 | 86 | 3.9 | Europe |
171 | Timor-Leste | 1 | 1 | 4 | 0.1 | Asia |
172 | Togo | 36 | 2 | 19 | 1.3 | Africa |
173 | Tonga | 36 | 21 | 5 | 1.1 | Oceania |
174 | Trinidad & Tobago | 197 | 156 | 7 | 6.4 | North America |
175 | Tunisia | 51 | 3 | 20 | 1.3 | Africa |
176 | Turkey | 51 | 22 | 7 | 1.4 | Asia |
177 | Turkmenistan | 19 | 71 | 32 | 2.2 | Asia |
178 | Tuvalu | 6 | 41 | 9 | 1.0 | Oceania |
179 | Uganda | 45 | 9 | 0 | 8.3 | Africa |
180 | Ukraine | 206 | 237 | 45 | 8.9 | Europe |
181 | United Arab Emirates | 16 | 135 | 5 | 2.8 | Asia |
182 | United Kingdom | 219 | 126 | 195 | 10.4 | Europe |
183 | Tanzania | 36 | 6 | 1 | 5.7 | Africa |
184 | USA | 249 | 158 | 84 | 8.7 | North America |
185 | Uruguay | 115 | 35 | 220 | 6.6 | South America |
186 | Uzbekistan | 25 | 101 | 8 | 2.4 | Asia |
187 | Vanuatu | 21 | 18 | 11 | 0.9 | Oceania |
188 | Venezuela | 333 | 100 | 3 | 7.7 | South America |
189 | Vietnam | 111 | 2 | 1 | 2.0 | Asia |
190 | Yemen | 6 | 0 | 0 | 0.1 | Asia |
191 | Zambia | 32 | 19 | 4 | 2.5 | Africa |
192 | Zimbabwe | 64 | 18 | 4 | 4.7 | Africa |
# reset the 'max_rows' option to its default
pd.reset_option('display.max_rows')
Documentation for set_option and reset_option
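If you only need a setting changed temporarily, option_context applies it for the duration of a with block and restores the previous value automatically. A sketch:
# temporarily display all rows; the option reverts when the block exits
with pd.option_context('display.max_rows', None):
    print(drinks)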
# the 'max_columns' option is similar to 'max_rows'
pd.get_option('display.max_columns')
20
# read the training dataset from Kaggle's Titanic competition into a DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# an ellipsis is displayed in the 'Name' cell of row 1 because of the 'max_colwidth' option
pd.get_option('display.max_colwidth')
50
# overwrite the current setting so that more characters will be displayed
pd.set_option('display.max_colwidth', 1000)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# overwrite the 'precision' setting to display 2 digits after the decimal point of 'Fare'
pd.set_option('display.precision', 2)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.25 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38.0 | 1 | 0 | PC 17599 | 71.28 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.92 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.10 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.05 | NaN | S |
# add two meaningless columns to the drinks DataFrame
drinks['x'] = drinks.wine_servings * 1000
drinks['y'] = drinks.total_litres_of_pure_alcohol * 1000
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | x | y | |
---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 0 | 0.0 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 54000 | 4900.0 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 14000 | 700.0 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 312000 | 12400.0 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 45000 | 5900.0 |
# use a Python format string to specify a comma as the thousands separator
pd.set_option('display.float_format', '{:,}'.format)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | x | y | |
---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 0 | 0.0 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 54000 | 4,900.0 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 14000 | 700.0 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 312000 | 12,400.0 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 45000 | 5,900.0 |
# 'y' was affected (but not 'x') because the 'float_format' option only affects floats (not ints)
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object x int64 y float64 dtype: object
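To undo the formatting change, reset the option the same way as max_rows above (a sketch):
# restore the default float formatting
pd.reset_option('display.float_format')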
# view the option descriptions (including the default and current values)
pd.describe_option()
display.chop_threshold : float or None if set to a float value, all float values smaller then the given threshold will be displayed as exactly 0 by repr and friends. [default: None] [currently: None]
display.colheader_justify : 'left'/'right' Controls the justification of column headers. used by DataFrameFormatter. [default: right] [currently: right]
display.column_space No description available. [default: 12] [currently: 12]
display.date_dayfirst : boolean When True, prints and parses dates with the day first, eg 20/01/2005 [default: False] [currently: False]
display.date_yearfirst : boolean When True, prints and parses dates with the year first, eg 2005/01/20 [default: False] [currently: False]
display.encoding : str/unicode Defaults to the detected encoding of the console. Specifies the encoding to be used for strings returned by to_string, these are generally strings meant to be displayed on the console. [default: UTF-8] [currently: UTF-8]
display.expand_frame_repr : boolean Whether to print out the full DataFrame repr for wide DataFrames across multiple lines, `max_columns` is still respected, but the output will wrap-around across multiple "pages" if its width exceeds `display.width`. [default: True] [currently: True]
display.float_format : callable The callable should accept a floating point number and return a string with the desired format of the number. This is used in some places like SeriesFormatter. See formats.format.EngFormatter for an example. [default: None] [currently: <built-in method format of str object at 0x000000000CAB1F58>]
display.height : int Deprecated. [default: 60] [currently: 60] (Deprecated, use `display.max_rows` instead.)
display.large_repr : 'truncate'/'info' For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can show a truncated table (the default from 0.13), or switch to the view from df.info() (the behaviour in earlier versions of pandas). [default: truncate] [currently: truncate]
display.latex.escape : bool This specifies if the to_latex method of a Dataframe uses escapes special characters. method. Valid values: False,True [default: True] [currently: True]
display.latex.longtable : bool This specifies if the to_latex method of a Dataframe uses the longtable format. method. Valid values: False,True [default: False] [currently: False]
display.latex.repr : boolean Whether to produce a latex DataFrame representation for jupyter environments that support it. (default: False) [default: False] [currently: False]
display.line_width : int Deprecated. [default: 80] [currently: 80] (Deprecated, use `display.width` instead.)
display.max_categories : int This sets the maximum number of categories pandas should output when printing out a `Categorical` or a Series of dtype "category". [default: 8] [currently: 8]
display.max_columns : int If max_cols is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as a summary view. 'None' value means unlimited. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 and pandas will auto-detect the width of the terminal and print a truncated object which fits the screen width. The IPython notebook, IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to do correct auto-detection. [default: 20] [currently: 20]
display.max_colwidth : int The maximum width in characters of a column in the repr of a pandas data structure. When the column overflows, a "..." placeholder is embedded in the output. [default: 50] [currently: 1000]
display.max_info_columns : int max_info_columns is used in DataFrame.info method to decide if per column information will be printed. [default: 100] [currently: 100]
display.max_info_rows : int or None df.info() will usually show null-counts for each column. For large frames this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than specified. [default: 1690785] [currently: 1690785]
display.max_rows : int If max_rows is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as a summary view. 'None' value means unlimited. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 and pandas will auto-detect the height of the terminal and print a truncated object which fits the screen height. The IPython notebook, IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to do correct auto-detection. [default: 60] [currently: 60]
display.max_seq_items : int or None when pretty-printing a long sequence, no more then `max_seq_items` will be printed. If items are omitted, they will be denoted by the addition of "..." to the resulting string. If set to None, the number of items to be printed is unlimited. [default: 100] [currently: 100]
display.memory_usage : bool, string or None This specifies if the memory usage of a DataFrame should be displayed when df.info() is called. Valid values True,False,'deep' [default: True] [currently: True]
display.mpl_style : bool Setting this to 'default' will modify the rcParams used by matplotlib to give plots a more pleasing visual style by default. Setting this to None/False restores the values to their initial value. [default: None] [currently: None]
display.multi_sparse : boolean "sparsify" MultiIndex display (don't display repeated elements in outer levels within groups) [default: True] [currently: True]
display.notebook_repr_html : boolean When True, IPython notebook will use html representation for pandas objects (if it is available). [default: True] [currently: True]
display.pprint_nest_depth : int Controls the number of nested levels to process when pretty-printing [default: 3] [currently: 3]
display.precision : int Floating point output precision (number of significant digits). This is only a suggestion [default: 6] [currently: 2]
display.show_dimensions : boolean or 'truncate' Whether to print out dimensions at the end of DataFrame repr. If 'truncate' is specified, only print out the dimensions if the frame is truncated (e.g. not display all rows and/or columns) [default: truncate] [currently: truncate]
display.unicode.ambiguous_as_wide : boolean Whether to use the Unicode East Asian Width to calculate the display text width. Enabling this may affect to the performance (default: False) [default: False] [currently: False]
display.unicode.east_asian_width : boolean Whether to use the Unicode East Asian Width to calculate the display text width. Enabling this may affect to the performance (default: False) [default: False] [currently: False]
display.width : int Width of the display in characters. In case python/IPython is running in a terminal this can be set to None and pandas will correctly auto-detect the width. Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to correctly detect the width. [default: 80] [currently: 80]
io.excel.xls.writer : string The default Excel writer engine for 'xls' files. Available options: 'xlwt' (the default). [default: xlwt] [currently: xlwt]
io.excel.xlsm.writer : string The default Excel writer engine for 'xlsm' files. Available options: 'openpyxl' (the default). [default: openpyxl] [currently: openpyxl]
io.excel.xlsx.writer : string The default Excel writer engine for 'xlsx' files. Available options: 'xlsxwriter' (the default), 'openpyxl'. [default: xlsxwriter] [currently: xlsxwriter]
io.hdf.default_format : format default format writing format, if None, then put will default to 'fixed' and append will default to 'table' [default: None] [currently: None]
io.hdf.dropna_table : boolean drop ALL nan rows when appending to a table [default: False] [currently: False]
mode.chained_assignment : string Raise an exception, warn, or no action if trying to use chained assignment, The default is warn [default: warn] [currently: warn]
mode.sim_interactive : boolean Whether to simulate interactive mode for purposes of testing [default: False] [currently: False]
mode.use_inf_as_null : boolean True means treat None, NaN, INF, -INF as null (old way), False means None and NaN are null, but INF, -INF are not null (new way). [default: False] [currently: False]
# search for specific options by name
pd.describe_option('rows')
display.max_info_rows : int or None df.info() will usually show null-counts for each column. For large frames this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than specified. [default: 1690785] [currently: 1690785]
display.max_rows : int If max_rows is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as a summary view. 'None' value means unlimited. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 and pandas will auto-detect the height of the terminal and print a truncated object which fits the screen height. The IPython notebook, IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to do correct auto-detection. [default: 60] [currently: 60]
Documentation for describe_option
# reset all of the options to their default values
pd.reset_option('all')
height has been deprecated.
line_width has been deprecated, use display.width instead (currently both are identical)
c:\Users\Kevin\Anaconda\lib\site-packages\ipykernel\__main__.py:2: FutureWarning: mpl_style had been deprecated and will be removed in a future version. Use `matplotlib.pyplot.style.use` instead.
  from ipykernel import kernelapp as app
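Options can also be changed temporarily: pd.option_context applies settings only for the duration of a with block and restores the previous values on exit. A minimal sketch, using a small throwaway DataFrame:
# temporarily limit the display to 4 rows; the option reverts after the block
with pd.option_context('display.max_rows', 4):
    print(pd.DataFrame({'x':range(10)}))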
# create a DataFrame from a dictionary (keys become column names, values become data)
pd.DataFrame({'id':[100, 101, 102], 'color':['red', 'blue', 'red']})
color | id | |
---|---|---|
0 | red | 100 |
1 | blue | 101 |
2 | red | 102 |
# optionally specify the order of columns and define the index
df = pd.DataFrame({'id':[100, 101, 102], 'color':['red', 'blue', 'red']}, columns=['id', 'color'], index=['a', 'b', 'c'])
df
id | color | |
---|---|---|
a | 100 | red |
b | 101 | blue |
c | 102 | red |
Documentation for DataFrame
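A DataFrame can also be created from a list of dictionaries, in which case each dictionary becomes a row and any missing keys are filled with NaN. A minimal sketch:
# each dictionary becomes a row (keys become column names); 'color' is missing from the last row
pd.DataFrame([{'id':100, 'color':'red'}, {'id':101, 'color':'blue'}, {'id':102}])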
# create a DataFrame from a list of lists (each inner list becomes a row)
pd.DataFrame([[100, 'red'], [101, 'blue'], [102, 'red']], columns=['id', 'color'])
id | color | |
---|---|---|
0 | 100 | red |
1 | 101 | blue |
2 | 102 | red |
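If the data is stored in parallel lists instead, zip can pair them into the row-wise structure that the constructor expects. A minimal sketch:
# zip pairs the two lists element-wise into (id, color) tuples
pd.DataFrame(list(zip([100, 101, 102], ['red', 'blue', 'red'])), columns=['id', 'color'])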
# create a NumPy array (with shape 4 by 2) and fill it with random numbers between 0 and 1
import numpy as np
arr = np.random.rand(4, 2)
arr
array([[ 0.9325265 , 0.48261452], [ 0.03239681, 0.94908844], [ 0.17615564, 0.80045853], [ 0.36113859, 0.95982213]])
# create a DataFrame from the NumPy array
pd.DataFrame(arr, columns=['one', 'two'])
one | two | |
---|---|---|
0 | 0.932527 | 0.482615 |
1 | 0.032397 | 0.949088 |
2 | 0.176156 | 0.800459 |
3 | 0.361139 | 0.959822 |
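Because the values are random, they will differ on every run; seeding NumPy's random number generator first makes the example reproducible. A sketch using an arbitrary seed:
# seed the random number generator so the same values are produced on each run
np.random.seed(42)
pd.DataFrame(np.random.rand(4, 2), columns=['one', 'two'])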
# create a DataFrame of student IDs (100 through 109) and test scores (random integers between 60 and 100)
pd.DataFrame({'student':np.arange(100, 110, 1), 'test':np.random.randint(60, 101, 10)})
student | test | |
---|---|---|
0 | 100 | 78 |
1 | 101 | 83 |
2 | 102 | 79 |
3 | 103 | 85 |
4 | 104 | 73 |
5 | 105 | 91 |
6 | 106 | 81 |
7 | 107 | 86 |
8 | 108 | 88 |
9 | 109 | 96 |
# 'set_index' can be chained with the DataFrame constructor to select an index
pd.DataFrame({'student':np.arange(100, 110, 1), 'test':np.random.randint(60, 101, 10)}).set_index('student')
test | |
---|---|
student | |
100 | 78 |
101 | 71 |
102 | 90 |
103 | 63 |
104 | 83 |
105 | 92 |
106 | 97 |
107 | 67 |
108 | 71 |
109 | 79 |
Documentation for set_index
# create a new Series using the Series constructor
s = pd.Series(['round', 'square'], index=['c', 'b'], name='shape')
s
c round b square Name: shape, dtype: object
Documentation for Series
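A Series can likewise be created from a dictionary, in which case the keys become the index. A minimal sketch:
# dictionary keys become the index labels
pd.Series({'c':'round', 'b':'square'}, name='shape')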
# concatenate the DataFrame and the Series (use axis=1 to concatenate columns)
pd.concat([df, s], axis=1)
id | color | shape | |
---|---|---|---|
a | 100 | red | NaN |
b | 101 | blue | square |
c | 102 | red | round |
Notes: The Series name ('shape') became the column name, and concat aligned the Series to the DataFrame by index, which is why row 'a' (absent from the Series index) contains NaN.
Documentation for concat
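By default, concat stacks objects vertically (axis=0); passing ignore_index=True discards the existing index labels and renumbers the rows. A minimal sketch:
# stack two copies of the DataFrame and rebuild the index as 0 through 5
pd.concat([df, df], ignore_index=True)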
# read the training dataset from Kaggle's Titanic competition into a DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Goal: Map the existing values of a Series to a different set of values
Method: map
(Series method)
# map 'female' to 0 and 'male' to 1
train['Sex_num'] = train.Sex.map({'female':0, 'male':1})
train.loc[0:4, ['Sex', 'Sex_num']]
Sex | Sex_num | |
---|---|---|
0 | male | 1 |
1 | female | 0 |
2 | female | 0 |
3 | female | 0 |
4 | male | 1 |
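One caution: any value not present in the dictionary passed to map becomes NaN rather than being left unchanged, so the mapping should cover every expected value. A minimal sketch:
# 'male' is missing from the dictionary, so every 'male' row maps to NaN
train.Sex.map({'female':0}).head()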
Goal: Apply a function to each element in a Series
Method: apply
(Series method)
Note: map can be substituted for apply in many cases, but apply is more flexible and thus is recommended
# calculate the length of each string in the 'Name' Series
train['Name_length'] = train.Name.apply(len)
train.loc[0:4, ['Name', 'Name_length']]
Name | Name_length | |
---|---|---|
0 | Braund, Mr. Owen Harris | 23 |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 51 |
2 | Heikkinen, Miss. Laina | 22 |
3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 44 |
4 | Allen, Mr. William Henry | 24 |
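For this particular task, the vectorized string method str.len produces the same result without apply. A minimal sketch:
# equivalent to train.Name.apply(len)
train.Name.str.len().head()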
# round up each element in the 'Fare' Series to the next integer
import numpy as np
train['Fare_ceil'] = train.Fare.apply(np.ceil)
train.loc[0:4, ['Fare', 'Fare_ceil']]
Fare | Fare_ceil | |
---|---|---|
0 | 7.2500 | 8.0 |
1 | 71.2833 | 72.0 |
2 | 7.9250 | 8.0 |
3 | 53.1000 | 54.0 |
4 | 8.0500 | 9.0 |
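NumPy ufuncs such as np.ceil also operate element-wise on a Series directly, so the apply call can be skipped here. A minimal sketch:
# equivalent to train.Fare.apply(np.ceil)
np.ceil(train.Fare).head()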
# we want to extract the last name of each person
train.Name.head()
0 Braund, Mr. Owen Harris 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 Heikkinen, Miss. Laina 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 Allen, Mr. William Henry Name: Name, dtype: object
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train.Name.str.split(',').head()
0 [Braund, Mr. Owen Harris] 1 [Cumings, Mrs. John Bradley (Florence Briggs ... 2 [Heikkinen, Miss. Laina] 3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)] 4 [Allen, Mr. William Henry] Name: Name, dtype: object
# define a function that returns an element from a list based on position
def get_element(my_list, position):
return my_list[position]
# apply the 'get_element' function and pass 'position' as a keyword argument
train.Name.str.split(',').apply(get_element, position=0).head()
0 Braund 1 Cumings 2 Heikkinen 3 Futrelle 4 Allen Name: Name, dtype: object
# alternatively, use a lambda function
train.Name.str.split(',').apply(lambda x: x[0]).head()
0 Braund 1 Cumings 2 Heikkinen 3 Futrelle 4 Allen Name: Name, dtype: object
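Alternatively, the str accessor can index into each list directly, which avoids the lambda entirely. A minimal sketch:
# select element 0 of each list produced by the split
train.Name.str.split(',').str[0].head()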
Goal: Apply a function along either axis of a DataFrame
Method: apply
(DataFrame method)
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# select a subset of the DataFrame to work with
drinks.loc[:, 'beer_servings':'wine_servings'].head()
beer_servings | spirit_servings | wine_servings | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 89 | 132 | 54 |
2 | 25 | 0 | 14 |
3 | 245 | 138 | 312 |
4 | 217 | 57 | 45 |
# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)
beer_servings 376 spirit_servings 438 wine_servings 370 dtype: int64
# apply the 'max' function along axis 1 to calculate the maximum value in each row
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=1).head()
0 0 1 132 2 25 3 312 4 217 dtype: int64
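For built-in reductions like this, the DataFrame's own max method is the more idiomatic (and typically faster) choice. A minimal sketch:
# equivalent to applying 'max' along axis 1
drinks.loc[:, 'beer_servings':'wine_servings'].max(axis=1).head()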
# use 'np.argmax' to calculate which column has the maximum value for each row
drinks.loc[:, 'beer_servings':'wine_servings'].apply(np.argmax, axis=1).head()
0 beer_servings 1 spirit_servings 2 beer_servings 3 wine_servings 4 beer_servings dtype: object
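Note that in more recent pandas and NumPy versions, applying np.argmax this way returns integer positions rather than column labels; the idxmax method is the stable way to get the label of the row-wise maximum. A minimal sketch:
# version-stable equivalent that returns the column name of each row's maximum
drinks.loc[:, 'beer_servings':'wine_servings'].idxmax(axis=1).head()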
Goal: Apply a function to every element in a DataFrame
Method: applymap
(DataFrame method)
# convert every DataFrame element into a float
drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head()
beer_servings | spirit_servings | wine_servings | |
---|---|---|---|
0 | 0.0 | 0.0 | 0.0 |
1 | 89.0 | 132.0 | 54.0 |
2 | 25.0 | 0.0 | 14.0 |
3 | 245.0 | 138.0 | 312.0 |
4 | 217.0 | 57.0 | 45.0 |
# overwrite the existing DataFrame columns
drinks.loc[:, 'beer_servings':'wine_servings'] = drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0.0 | 0.0 | 0.0 | 0.0 | Asia |
1 | Albania | 89.0 | 132.0 | 54.0 | 4.9 | Europe |
2 | Algeria | 25.0 | 0.0 | 14.0 | 0.7 | Africa |
3 | Andorra | 245.0 | 138.0 | 312.0 | 12.4 | Europe |
4 | Angola | 217.0 | 57.0 | 45.0 | 5.9 | Africa |
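For a plain type conversion like this one, astype is the more direct tool. Note also that applymap was renamed to DataFrame.map in pandas 2.1, so newer code should prefer that spelling. A minimal sketch:
# equivalent type conversion without applymap
drinks.loc[:, 'beer_servings':'wine_servings'].astype(float).head()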