import pandas as pd
# read a dataset of movie reviewers
# (movie.user is pipe-delimited with no header row, so the column names
# must be supplied explicitly through the 'names' parameter)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('./data/movie.user',
sep='|',
header=None,
names=user_cols)
# examine the first 5 rows
users.head()
user_id | age | gender | occupation | zip_code | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
# read_csv is equivalent to read_table, except it assumes a comma separator
# (the first line of the file is inferred as the header row by default)
ufo = pd.read_csv('./data/ufo.csv')
# examine the first 5 rows
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# specify which columns to include by name
# NOTE(review): fixed — the original used '.head' without parentheses, which
# displays the bound method object instead of the first 5 rows (see the
# '<bound method NDFrame.head of ...>' output below); head() must be called.
pd.read_csv('./data/ufo.csv', usecols=['City', 'State']).head()
<bound method NDFrame.head of City State 0 Ithaca NY 1 Willingboro NJ 2 Holyoke CO 3 Abilene KS 4 New York Worlds Fair NY ... ... ... 18236 Grant Park IL 18237 Spirit Lake IA 18238 Eagle River WI 18239 Eagle River WI 18240 Ybor FL [18241 rows x 2 columns]>
# specify columns by position
# NOTE(review): fixed two issues — the bare filename 'ufo.csv' raised
# FileNotFoundError in this session (see traceback below; the path that
# worked earlier is './data/ufo.csv'), and '.head' was missing its call
# parentheses, which would display the bound method rather than the rows.
pd.read_csv('./data/ufo.csv', usecols=[0, 4]).head()
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[7], line 2 1 # specify columns by position ----> 2 pd.read_csv('ufo.csv', usecols=[0, 4]).head File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 935 kwds_defaults = _refine_defaults_read( 936 dialect, 937 delimiter, (...) 
946 defaults={"delimiter": ","}, 947 ) 948 kwds.update(kwds_defaults) --> 950 return _read(filepath_or_buffer, kwds) File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds) 602 _validate_names(kwds.get("names", None)) 604 # Create the parser. --> 605 parser = TextFileReader(filepath_or_buffer, **kwds) 607 if chunksize or iterator: 608 return parser File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds) 1439 self.options["has_index_names"] = kwds["has_index_names"] 1441 self.handles: IOHandles | None = None -> 1442 self._engine = self._make_engine(f, self.engine) File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine) 1733 if "b" not in mode: 1734 mode += "b" -> 1735 self.handles = get_handle( 1736 f, 1737 mode, 1738 encoding=self.options.get("encoding", None), 1739 compression=self.options.get("compression", None), 1740 memory_map=self.options.get("memory_map", False), 1741 is_text=is_text, 1742 errors=self.options.get("encoding_errors", "strict"), 1743 storage_options=self.options.get("storage_options", None), 1744 ) 1745 assert self.handles is not None 1746 f = self.handles.handle File /usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 851 elif isinstance(handle, str): 852 # Check whether the filename is to be opened in binary mode. 853 # Binary mode does not support 'encoding' and 'newline'. 
854 if ioargs.encoding and "b" not in ioargs.mode: 855 # Encoding --> 856 handle = open( 857 handle, 858 ioargs.mode, 859 encoding=ioargs.encoding, 860 errors=errors, 861 newline="", 862 ) 863 else: 864 # Binary mode 865 handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] No such file or directory: 'ufo.csv'
# specify how many rows to read
# NOTE(review): this cell uses the bare path 'ufo.csv' while earlier cells
# use './data/ufo.csv' (and that bare path raised FileNotFoundError above,
# yet this cell's captured output shows success) — presumably the working
# directory changed between runs; verify which path is correct.
pd.read_csv('ufo.csv', nrows=3).head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# read a dataset of top-rated IMDb movies into a DataFrame
# NOTE(review): path is 'imdb_1000.csv' here but 'pandas/imdb_1000.csv' in
# later cells — confirm the intended location of the data files.
movies = pd.read_csv('imdb_1000.csv')
# example method: show the first 5 rows
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
star_rating float64 title object content_rating object genre object duration int64 actors_list object dtype: object
# example method: calculate summary statistics
movies.describe()
star_rating | duration | |
---|---|---|
count | 979.000000 | 979.000000 |
mean | 7.889785 | 120.979571 |
std | 0.336069 | 26.218010 |
min | 7.400000 | 64.000000 |
25% | 7.600000 | 102.000000 |
50% | 7.800000 | 117.000000 |
75% | 8.100000 | 134.000000 |
max | 9.300000 | 242.000000 |
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])
title | content_rating | genre | actors_list | |
---|---|---|---|---|
count | 979 | 976 | 979 | 979 |
unique | 975 | 12 | 16 | 969 |
top | Les Miserables | R | Drama | [u'Daniel Radcliffe', u'Emma Watson', u'Rupert... |
freq | 2 | 460 | 278 | 6 |
# create a new 'Location' Series (must use bracket notation to define the Series name)
# (the '+' concatenation is applied element-wise across the two string Series)
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.head()
City | Colors Reported | Shape Reported | State | Time | Location | |
---|---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 | Ithaca, NY |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 | Willingboro, NJ |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 | Holyoke, CO |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 | Abilene, KS |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 | New York Worlds Fair, NY |
# read a dataset of UFO reports into a DataFrame
# (re-reading the file replaces the DataFrame, discarding the 'Location'
# column that was added above)
ufo = pd.read_csv('ufo.csv')
# examine the column names
ufo.columns
Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')
# rename two of the columns by using the 'rename' method
# (columns not listed in the mapping keep their existing names;
# inplace=True modifies ufo directly instead of returning a copy)
ufo.rename(columns={
'Colors Reported': 'Colors_Reported',
'Shape Reported': 'Shape_Reported'
},
inplace=True)
ufo.columns
Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace the column names during the file reading process by using the 'names' parameter
# (header=0 tells pandas row 0 is a header line to be replaced by 'names',
# not a data row)
ufo = pd.read_csv('ufo.csv', header=0, names=ufo_cols)
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns
Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')
ufo = pd.read_csv('ufo.csv')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove a single column (axis=1 refers to columns)
ufo.drop('Colors Reported', axis=1, inplace=True)
ufo.head()
City | Shape Reported | State | Time | |
---|---|---|---|---|
0 | Ithaca | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
# remove multiple columns at once
ufo.drop(columns=['City', 'State']).head()
Shape Reported | Time | |
---|---|---|
0 | TRIANGLE | 6/1/1930 22:00 |
1 | OTHER | 6/30/1930 20:00 |
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()
City | Shape Reported | State | Time | |
---|---|---|---|---|
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
5 | Valley City | DISK | ND | 9/15/1934 15:30 |
6 | Crater Lake | CIRCLE | CA | 6/15/1935 0:00 |
ufo = pd.read_csv('ufo.csv')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# Series are directly iterable (like a list)
for c in ufo.City[:10]:
print(c)
Ithaca Willingboro Holyoke Abilene New York Worlds Fair Valley City Crater Lake Alma Eklutna Hubbard
# various methods are available to iterate through a DataFrame
for index, row in ufo[:10].iterrows():
print(index, row.City, row.State)
0 Ithaca NY 1 Willingboro NJ 2 Holyoke CO 3 Abilene KS 4 New York Worlds Fair NY 5 Valley City ND 6 Crater Lake CA 7 Alma MI 8 Eklutna AK 9 Hubbard OR
# index and columns both default to integers if you don't define them
pd.read_table('movie.user', header=None, sep='|').head()
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
RangeIndex(start=0, stop=193, step=1)
# set an existing column as the index
drinks.set_index('country', inplace=True)
drinks.head()
# 'country' is now the index
drinks.index
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', ... 'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'], dtype='object', name='country', length=193)
# restore the index name, and move the index back to a column
# NOTE(review): after set_index('country') the index is already named
# 'country' (see the Index repr above, name='country'), so this assignment
# is a no-op here; it only matters if the name was cleared in between.
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# select the 'City' Series using bracket notation
ufo['City']
# or equivalently, use dot notation
ufo.City
0 Ithaca 1 Willingboro 2 Holyoke 3 Abilene 4 New York Worlds Fair ... 18236 Grant Park 18237 Spirit Lake 18238 Eagle River 18239 Eagle River 18240 Ybor Name: City, Length: 18241, dtype: object
# read a dataset of alcohol consumption into a DataFrame, and check the data types
drinks = pd.read_csv('drinks.csv')
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 dtype: object
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('ufo.csv')
ufo.head(3)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# row 0, all columns
ufo.loc[0, :]
City Ithaca Colors Reported NaN Shape Reported TRIANGLE State NY Time 6/1/1930 22:00 Name: 0, dtype: object
# rows 0 and 1 and 2, all columns
ufo.loc[[0, 1, 2], :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), all columns
ufo.loc[0:2, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
# rows 0 through 2 (inclusive), column 'City'
ufo.loc[0:2, 'City']
0 Ithaca 1 Willingboro 2 Holyoke Name: City, dtype: object
# rows 0 through 2 (inclusive), columns 'City' and 'State'
ufo.loc[0:2, ['City', 'State']]
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
ufo.loc[0:2, 'City':'State']
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
2 | Holyoke | NaN | OVAL | CO |
# rows in positions 0 and 1, columns in positions 0 and 3
ufo.iloc[[0, 1], [0, 3]]
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
ufo.iloc[0:2, 0:4]
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY |
1 | Willingboro | NaN | OTHER | NJ |
# rows in positions 0 through 2 (exclusive), all columns
ufo.iloc[0:2, :]
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
drinks.loc[drinks.continent=='South America']
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
country | |||||
Argentina | 193 | 25 | 221 | 8.3 | South America |
Bolivia | 167 | 41 | 8 | 3.8 | South America |
Brazil | 245 | 145 | 16 | 7.2 | South America |
Chile | 130 | 124 | 172 | 7.6 | South America |
Colombia | 159 | 76 | 3 | 4.2 | South America |
Ecuador | 162 | 74 | 3 | 4.2 | South America |
Guyana | 93 | 302 | 1 | 7.1 | South America |
Paraguay | 213 | 117 | 74 | 7.3 | South America |
Peru | 163 | 160 | 21 | 6.1 | South America |
Suriname | 128 | 178 | 7 | 5.6 | South America |
Uruguay | 115 | 35 | 220 | 6.6 | South America |
Venezuela | 333 | 100 | 3 | 7.7 | South America |
drinks.loc[drinks.continent=='South America', 'wine_servings']
6 221 20 8 23 16 35 172 37 3 52 3 72 1 132 74 133 21 163 7 185 220 188 3 Name: wine_servings, dtype: int64
# NOTE(review): the captured output below is the full DataFrame, which does
# not match this code — selecting ['wine_servings'] returns a single Series
# (as the previous .loc cell shows). Either the code or the recorded output
# of this cell is stale; verify by re-running.
drinks.query("continent=='South America'")['wine_servings']
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|
country | |||||
Argentina | 193 | 25 | 221 | 8.3 | South America |
Bolivia | 167 | 41 | 8 | 3.8 | South America |
Brazil | 245 | 145 | 16 | 7.2 | South America |
Chile | 130 | 124 | 172 | 7.6 | South America |
Colombia | 159 | 76 | 3 | 4.2 | South America |
Ecuador | 162 | 74 | 3 | 4.2 | South America |
Guyana | 93 | 302 | 1 | 7.1 | South America |
Paraguay | 213 | 117 | 74 | 7.3 | South America |
Peru | 163 | 160 | 21 | 6.1 | South America |
Suriname | 128 | 178 | 7 | 5.6 | South America |
Uruguay | 115 | 35 | 220 | 6.6 | South America |
Venezuela | 333 | 100 | 3 | 7.7 | South America |
movies = pd.read_csv('pandas/imdb_1000.csv')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# use the '&' operator to specify that both conditions are required
movies.loc[(movies.duration >=200) & (movies.genre == 'Drama')]
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.loc[(movies.duration >=200) | (movies.genre == 'Drama')].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
movies.loc[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
11 | 8.8 | Inception | PG-13 | Action | 148 | [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... |
12 | 8.8 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 124 | [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
drinks = pd.read_csv('pandas/drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
drinks.continent.value_counts()
Africa 53 Europe 45 Asia 44 North America 23 Oceania 16 South America 12 Name: continent, dtype: int64
drinks.continent.value_counts(normalize=True)
Africa 0.274611 Europe 0.233161 Asia 0.227979 North America 0.119171 Oceania 0.082902 South America 0.062176 Name: continent, dtype: float64
# access the Series index
drinks.continent.value_counts().index
Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania', 'South America'], dtype='object')
# access the Series values
drinks.continent.value_counts().values
array([53, 45, 44, 23, 16, 12])
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
South America 12 Oceania 16 North America 23 Asia 44 Europe 45 Africa 53 Name: continent, dtype: int64
# any Series can also be sorted by its index
drinks.continent.value_counts().sort_index()
Africa 53 Asia 44 Europe 45 North America 23 Oceania 16 South America 12 Name: continent, dtype: int64
drinks.continent.unique()
array(['Asia', 'Europe', 'Africa', 'North America', 'South America', 'Oceania'], dtype=object)
drinks.continent.nunique()
6
movies = pd.read_csv('pandas/imdb_1000.csv')
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542 (500) Days of Summer 5 12 Angry Men 201 12 Years a Slave 698 127 Hours 110 2001: A Space Odyssey Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864 [Rec] 526 Zulu 615 Zombieland 677 Zodiac 955 Zero Dark Thirty Name: title, dtype: object
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values('title').head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
542 | 7.8 | (500) Days of Summer | PG-13 | Comedy | 95 | [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
201 | 8.1 | 12 Years a Slave | R | Biography | 134 | [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... |
698 | 7.6 | 127 Hours | R | Adventure | 94 | [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] |
110 | 8.3 | 2001: A Space Odyssey | G | Mystery | 160 | [u'Keir Dullea', u'Gary Lockwood', u'William S... |
# sort in descending order instead
movies.sort_values('title', ascending=False).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
864 | 7.5 | [Rec] | R | Horror | 78 | [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... |
526 | 7.8 | Zulu | UNRATED | Drama | 138 | [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... |
615 | 7.7 | Zombieland | R | Comedy | 88 | [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... |
677 | 7.7 | Zodiac | R | Crime | 157 | [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... |
955 | 7.4 | Zero Dark Thirty | R | Drama | 157 | [u'Jessica Chastain', u'Joel Edgerton', u'Chri... |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(['content_rating', 'duration']).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
713 | 7.6 | The Jungle Book | APPROVED | Animation | 78 | [u'Phil Harris', u'Sebastian Cabot', u'Louis P... |
513 | 7.8 | Invasion of the Body Snatchers | APPROVED | Horror | 80 | [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... |
272 | 8.1 | The Killing | APPROVED | Crime | 85 | [u'Sterling Hayden', u'Coleen Gray', u'Vince E... |
703 | 7.6 | Dracula | APPROVED | Horror | 85 | [u'Bela Lugosi', u'Helen Chandler', u'David Ma... |
612 | 7.7 | A Hard Day's Night | APPROVED | Comedy | 87 | [u'John Lennon', u'Paul McCartney', u'George H... |
drinks = pd.read_csv('pandas/drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# calculate the mean beer servings for each continent
drinks.groupby('continent').beer_servings.mean()
continent Africa 61.471698 Asia 37.045455 Europe 193.777778 North America 145.434783 Oceania 89.687500 South America 175.083333 Name: beer_servings, dtype: float64
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby('continent').beer_servings.max()
continent Africa 376 Asia 247 Europe 361 North America 285 Oceania 306 South America 333 Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])
count | mean | min | max | |
---|---|---|---|---|
continent | ||||
Africa | 53 | 61.471698 | 0 | 376 |
Asia | 44 | 37.045455 | 0 | 247 |
Europe | 45 | 193.777778 | 0 | 361 |
North America | 23 | 145.434783 | 1 | 285 |
Oceania | 16 | 89.687500 | 0 | 306 |
South America | 12 | 175.083333 | 93 | 333 |
pd.crosstab(drinks.country, drinks.continent)
continent | Africa | Asia | Europe | North America | Oceania | South America |
---|---|---|---|---|---|---|
country | ||||||
Afghanistan | 0 | 1 | 0 | 0 | 0 | 0 |
Albania | 0 | 0 | 1 | 0 | 0 | 0 |
Algeria | 1 | 0 | 0 | 0 | 0 | 0 |
Andorra | 0 | 0 | 1 | 0 | 0 | 0 |
Angola | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
Venezuela | 0 | 0 | 0 | 0 | 0 | 1 |
Vietnam | 0 | 1 | 0 | 0 | 0 | 0 |
Yemen | 0 | 1 | 0 | 0 | 0 | 0 |
Zambia | 1 | 0 | 0 | 0 | 0 | 0 |
Zimbabwe | 1 | 0 | 0 | 0 | 0 | 0 |
193 rows × 6 columns
stocks = pd.read_csv('pandas/stocks.csv')
stocks
Date | Close | Volume | Symbol | |
---|---|---|---|---|
0 | 2016-10-03 | 31.50 | 14070500 | CSCO |
1 | 2016-10-03 | 112.52 | 21701800 | AAPL |
2 | 2016-10-03 | 57.42 | 19189500 | MSFT |
3 | 2016-10-04 | 113.00 | 29736800 | AAPL |
4 | 2016-10-04 | 57.24 | 20085900 | MSFT |
5 | 2016-10-04 | 31.35 | 18460400 | CSCO |
6 | 2016-10-05 | 57.64 | 16726400 | MSFT |
7 | 2016-10-05 | 31.59 | 11808600 | CSCO |
8 | 2016-10-05 | 113.05 | 21453100 | AAPL |
stocks.index
RangeIndex(start=0, stop=9, step=1)
ser = stocks.groupby(['Symbol', 'Date'])['Close'].mean()
ser
Symbol Date AAPL 2016-10-03 112.52 2016-10-04 113.00 2016-10-05 113.05 CSCO 2016-10-03 31.50 2016-10-04 31.35 2016-10-05 31.59 MSFT 2016-10-03 57.42 2016-10-04 57.24 2016-10-05 57.64 Name: Close, dtype: float64
ser.index
MultiIndex([('AAPL', '2016-10-03'), ('AAPL', '2016-10-04'), ('AAPL', '2016-10-05'), ('CSCO', '2016-10-03'), ('CSCO', '2016-10-04'), ('CSCO', '2016-10-05'), ('MSFT', '2016-10-03'), ('MSFT', '2016-10-04'), ('MSFT', '2016-10-05')], names=['Symbol', 'Date'])
ser.unstack()
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
---|---|---|---|
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
stocks.set_index(['Symbol', 'Date'], inplace=True)
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
CSCO | 2016-10-03 | 31.50 | 14070500 |
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
AAPL | 2016-10-04 | 113.00 | 29736800 |
MSFT | 2016-10-04 | 57.24 | 20085900 |
CSCO | 2016-10-04 | 31.35 | 18460400 |
MSFT | 2016-10-05 | 57.64 | 16726400 |
CSCO | 2016-10-05 | 31.59 | 11808600 |
AAPL | 2016-10-05 | 113.05 | 21453100 |
stocks.index
MultiIndex([('CSCO', '2016-10-03'), ('AAPL', '2016-10-03'), ('MSFT', '2016-10-03'), ('AAPL', '2016-10-04'), ('MSFT', '2016-10-04'), ('CSCO', '2016-10-04'), ('MSFT', '2016-10-05'), ('CSCO', '2016-10-05'), ('AAPL', '2016-10-05')], names=['Symbol', 'Date'])
stocks.sort_index(inplace=True)
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
2016-10-05 | 31.59 | 11808600 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
stocks.loc[('AAPL', '2016-10-03'), :]
Close 112.52 Volume 21701800.00 Name: (AAPL, 2016-10-03), dtype: float64
stocks.loc[('AAPL', '2016-10-03'), 'Close']
112.52
stocks.loc[['AAPL', 'MSFT'], :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), 'Close']
Symbol Date AAPL 2016-10-03 112.52 MSFT 2016-10-03 57.42 Name: Close, dtype: float64
stocks.loc[(slice(None), ['2016-10-03', '2016-10-04']), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 |
df = stocks.pivot_table(values='Close', index='Symbol', columns='Date')
df
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
---|---|---|---|
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
# read a dataset of movie reviewers into a DataFrame
# (index_col='user_id' promotes that column to the row labels, so the
# DataFrame has 4 remaining columns instead of 5)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('pandas/movie.user', sep='|', header=None, names=user_cols, index_col='user_id')
users.head()
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
1 | 24 | M | technician | 85711 |
2 | 53 | F | other | 94043 |
3 | 23 | M | writer | 32067 |
4 | 24 | M | technician | 43537 |
5 | 33 | F | other | 15213 |
users.shape
(943, 4)
# detect duplicate zip codes: True if an item is identical to a previous item
users['zip_code'].duplicated().tail()
user_id 939 False 940 True 941 False 942 False 943 False Name: zip_code, dtype: bool
# count the duplicate items (True becomes 1, False becomes 0)
users['zip_code'].duplicated().sum()
148
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep='first'), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep='last'), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep='first').shape
(936, 4)
users.drop_duplicates(keep='last').shape
(936, 4)
users.drop_duplicates(keep=False).shape
(929, 4)
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=['age', 'zip_code']).sum()
16
users.drop_duplicates(subset=['age', 'zip_code']).shape
(927, 4)
ufo = pd.read_csv('pandas/ufo.csv')
ufo.tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
ufo.isna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | False | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
ufo.notna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | True | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
# count the number of missing values in each Series
ufo.isna().sum()
City 25 Colors Reported 15359 Shape Reported 0 State 0 Time 0 dtype: int64
# use the 'isna' Series method (alias of 'isnull') to filter the DataFrame rows,
# consistent with the isna()/notna() calls used earlier in this notebook
ufo.loc[ufo.City.isna(), :].head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how='any').shape
(2486, 5)
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how='all').shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
(15576, 5)
# 'value_counts' does not include missing values by default
ufo['Shape Reported'].value_counts().head()
LIGHT 2803 DISK 2122 TRIANGLE 1889 OTHER 1402 CIRCLE 1365 Name: Shape Reported, dtype: int64
# explicitly include missing values
ufo['Shape Reported'].value_counts(dropna=False).head()
LIGHT 2803 NaN 2644 DISK 2122 TRIANGLE 1889 OTHER 1402 Name: Shape Reported, dtype: int64
# fill in missing values with a specified value
# (assign the result back instead of using inplace=True on a column selection,
# which relies on chained assignment and is unreliable under copy-on-write)
ufo['Shape Reported'] = ufo['Shape Reported'].fillna('VARIOUS')
# confirm that the missing values were filled in
ufo['Shape Reported'].value_counts().head()
VARIOUS 2977 LIGHT 2803 DISK 2122 TRIANGLE 1889 OTHER 1402 Name: Shape Reported, dtype: int64
# fill in missing values
ufo.fillna(value='UNKNOWN').head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | UNKNOWN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | UNKNOWN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | UNKNOWN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | UNKNOWN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | UNKNOWN | LIGHT | NY | 4/18/1933 19:00 |
# fill missing values using "backward fill" strategy (doesn't affect the
# DataFrame since the result is not assigned back)
# bfill() is the modern spelling; fillna(method='bfill') is deprecated
ufo.bfill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
# compare with "forward fill" strategy (doesn't affect the DataFrame since the
# result is not assigned back)
# ffill() is the modern spelling; fillna(method='ffill') is deprecated
ufo.ffill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | RED | OVAL | FL | 12/31/2000 23:59 |
train = pd.read_csv('pandas/titanic_train.csv')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# map 'female' to 0 and 'male' to 1
train['Sex_num'] = train.Sex.map({'female': 0, 'male': 1})
train.loc[0:4, ['Sex', 'Sex_num']]
Sex | Sex_num | |
---|---|---|
0 | male | 1 |
1 | female | 0 |
2 | female | 0 |
3 | female | 0 |
4 | male | 1 |
# calculate the length of each string in the 'Name' Series
train['Name_length'] = train['Name'].apply(len)
train.loc[0:4, ['Name', 'Name_length']]
Name | Name_length | |
---|---|---|
0 | Braund, Mr. Owen Harris | 23 |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 51 |
2 | Heikkinen, Miss. Laina | 22 |
3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 44 |
4 | Allen, Mr. William Henry | 24 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train['Name'].str.split(',').head()
0 [Braund, Mr. Owen Harris] 1 [Cumings, Mrs. John Bradley (Florence Briggs ... 2 [Heikkinen, Miss. Laina] 3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)] 4 [Allen, Mr. William Henry] Name: Name, dtype: object
# use a lambda function
train['Name'].str.split(',').apply(lambda x: x[0]).head()
0 Braund 1 Cumings 2 Heikkinen 3 Futrelle 4 Allen Name: Name, dtype: object
drinks = pd.read_csv('pandas/drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# apply the 'max' function along axis 0 to calculate the maximum value in each column
# (this demonstrates apply; outside a demo, the vectorized .max(axis=0) is preferred)
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)
beer_servings 376 spirit_servings 438 wine_servings 370 dtype: int64
# convert every DataFrame element into a float
# astype is the vectorized equivalent of applymap(float) and avoids the
# per-element Python call (applymap is deprecated in pandas >= 2.1)
drinks.loc[:, 'beer_servings':'wine_servings'].astype(float).head()
beer_servings | spirit_servings | wine_servings | |
---|---|---|---|
0 | 0.0 | 0.0 | 0.0 |
1 | 89.0 | 132.0 | 54.0 |
2 | 25.0 | 0.0 | 14.0 |
3 | 245.0 | 138.0 | 312.0 |
4 | 217.0 | 57.0 | 45.0 |
# overwrite the existing DataFrame columns
drinks.loc[:, 'beer_servings':
'wine_servings'] = drinks.loc[:, 'beer_servings':
'wine_servings'].applymap(float)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0.0 | 0.0 | 0.0 | 0.0 | Asia |
1 | Albania | 89.0 | 132.0 | 54.0 | 4.9 | Europe |
2 | Algeria | 25.0 | 0.0 | 14.0 | 0.7 | Africa |
3 | Andorra | 245.0 | 138.0 | 312.0 | 12.4 | Europe |
4 | Angola | 217.0 | 57.0 | 45.0 | 5.9 | Africa |
# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)
# NOTE(review): 'people' is not defined anywhere in this notebook chunk, and this
# cell shows no output — confirm it was created in an earlier session/cell,
# otherwise this raises NameError
pd.concat([drinks, people], axis=1).head()
movie_cols = ['movie_id', 'title']
movies = pd.read_table('pandas/u.item', sep='|', header=None, names=movie_cols, usecols=[0, 1])
movies.head()
movie_id | title | |
---|---|---|
0 | 1 | Toy Story (1995) |
1 | 2 | GoldenEye (1995) |
2 | 3 | Four Rooms (1995) |
3 | 4 | Get Shorty (1995) |
4 | 5 | Copycat (1995) |
movies.shape
(1682, 2)
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('pandas/u.data', sep='\t', header=None, names=rating_cols)
ratings.head()
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 196 | 242 | 3 | 881250949 |
1 | 186 | 302 | 3 | 891717742 |
2 | 22 | 377 | 1 | 878887116 |
3 | 244 | 51 | 2 | 880606923 |
4 | 166 | 346 | 1 | 886397596 |
ratings.shape
(100000, 4)
ratings.query("movie_id == 1").head()
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
24 | 308 | 1 | 4 | 887736532 |
454 | 287 | 1 | 5 | 875334088 |
957 | 148 | 1 | 4 | 877019411 |
971 | 280 | 1 | 4 | 891700426 |
1324 | 66 | 1 | 3 | 883601324 |
movie_ratings = pd.merge(movies, ratings)
movie_ratings.columns
Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')
movie_ratings.head()
movie_id | title | user_id | rating | timestamp | |
---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 308 | 4 | 887736532 |
1 | 1 | Toy Story (1995) | 287 | 5 | 875334088 |
2 | 1 | Toy Story (1995) | 148 | 4 | 877019411 |
3 | 1 | Toy Story (1995) | 280 | 4 | 891700426 |
4 | 1 | Toy Story (1995) | 66 | 3 | 883601324 |
print(movies.shape)
print(ratings.shape)
print(movie_ratings.shape)
(1682, 2) (100000, 4) (100000, 5)
movies.columns = ['m_id', 'title']
movies.columns
Index(['m_id', 'title'], dtype='object')
ratings.columns
Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')
pd.merge(movies, ratings, left_on='m_id', right_on='movie_id').head()
m_id | title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 308 | 1 | 4 | 887736532 |
1 | 1 | Toy Story (1995) | 287 | 1 | 5 | 875334088 |
2 | 1 | Toy Story (1995) | 148 | 1 | 4 | 877019411 |
3 | 1 | Toy Story (1995) | 280 | 1 | 4 | 891700426 |
4 | 1 | Toy Story (1995) | 66 | 1 | 3 | 883601324 |
pd.merge(movies, ratings, left_index=True, right_on='movie_id').head()
title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|
24 | Toy Story (1995) | 308 | 1 | 4 | 887736532 |
454 | Toy Story (1995) | 287 | 1 | 5 | 875334088 |
957 | Toy Story (1995) | 148 | 1 | 4 | 877019411 |
971 | Toy Story (1995) | 280 | 1 | 4 | 891700426 |
1324 | Toy Story (1995) | 66 | 1 | 3 | 883601324 |
pd.merge(movies, ratings, left_index=True, right_index=True).head()
title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|
1 | Toy Story (1995) | 186 | 302 | 3 | 891717742 |
2 | GoldenEye (1995) | 22 | 377 | 1 | 878887116 |
3 | Four Rooms (1995) | 244 | 51 | 2 | 880606923 |
4 | Get Shorty (1995) | 166 | 346 | 1 | 886397596 |
5 | Copycat (1995) | 298 | 474 | 4 | 884182806 |
close = pd.read_csv('pandas/stocks.csv', usecols=[0, 1, 3], index_col=['Symbol', 'Date']).sort_index()
close
Close | ||
---|---|---|
Symbol | Date | |
AAPL | 2016-10-03 | 112.52 |
2016-10-04 | 113.00 | |
2016-10-05 | 113.05 | |
CSCO | 2016-10-03 | 31.50 |
2016-10-04 | 31.35 | |
2016-10-05 | 31.59 | |
MSFT | 2016-10-03 | 57.42 |
2016-10-04 | 57.24 | |
2016-10-05 | 57.64 |
volume = pd.read_csv('pandas/stocks.csv', usecols=[0, 2, 3], index_col=['Symbol', 'Date']).sort_index()
volume
Volume | ||
---|---|---|
Symbol | Date | |
AAPL | 2016-10-03 | 21701800 |
2016-10-04 | 29736800 | |
2016-10-05 | 21453100 | |
CSCO | 2016-10-03 | 14070500 |
2016-10-04 | 18460400 | |
2016-10-05 | 11808600 | |
MSFT | 2016-10-03 | 19189500 |
2016-10-04 | 20085900 | |
2016-10-05 | 16726400 |
both = pd.merge(close, volume, left_index=True, right_index=True)
both
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
2016-10-05 | 31.59 | 11808600 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
both.reset_index()
Symbol | Date | Close | Volume | |
---|---|---|---|---|
0 | AAPL | 2016-10-03 | 112.52 | 21701800 |
1 | AAPL | 2016-10-04 | 113.00 | 29736800 |
2 | AAPL | 2016-10-05 | 113.05 | 21453100 |
3 | CSCO | 2016-10-03 | 31.50 | 14070500 |
4 | CSCO | 2016-10-04 | 31.35 | 18460400 |
5 | CSCO | 2016-10-05 | 31.59 | 11808600 |
6 | MSFT | 2016-10-03 | 57.42 | 19189500 |
7 | MSFT | 2016-10-04 | 57.24 | 20085900 |
8 | MSFT | 2016-10-05 | 57.64 | 16726400 |
# demo frame: three colors, each with a numeric label
A = pd.DataFrame({
    'color': ['green', 'yellow', 'red'],
    'num': [1, 2, 3],
})
A
color | num | |
---|---|---|
0 | green | 1 |
1 | yellow | 2 |
2 | red | 3 |
# demo frame: three colors with size labels (shares 'green' and 'yellow' with A)
B = pd.DataFrame({
    'color': ['green', 'yellow', 'pink'],
    'size': ['S', 'M', 'L'],
})
B
color | size | |
---|---|---|
0 | green | S |
1 | yellow | M |
2 | pink | L |
Inner join: Only include observations found in both A and B:
pd.merge(A, B, how='inner')
color | num | size | |
---|---|---|---|
0 | green | 1 | S |
1 | yellow | 2 | M |
Outer join: Include observations found in either A or B:
pd.merge(A, B, how='outer')
color | num | size | |
---|---|---|---|
0 | green | 1.0 | S |
1 | yellow | 2.0 | M |
2 | red | 3.0 | NaN |
3 | pink | NaN | L |
Left join: Include all observations found in A:
pd.merge(A, B, how='left')
color | num | size | |
---|---|---|---|
0 | green | 1 | S |
1 | yellow | 2 | M |
2 | red | 3 | NaN |
Right join: Include all observations found in B:
pd.merge(A, B, how='right')
color | num | size | |
---|---|---|---|
0 | green | 1.0 | S |
1 | yellow | 2.0 | M |
2 | pink | NaN | L |
# read a dataset of alcohol consumption into a DataFrame
# NOTE(review): bare 'drinks.csv' here vs 'pandas/drinks.csv' at the earlier
# read — works only when the working directory matches; confirm and unify paths
drinks = pd.read_csv('drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# examine the data type of each Series
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# change the data type of an existing Series
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes
country object beer_servings float64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# alternatively, change the data type of a Series while reading in a file
# NOTE(review): bare path again ('drinks.csv') — confirm working directory
drinks = pd.read_csv('drinks.csv', dtype={'beer_servings': float})
drinks.dtypes
country object beer_servings float64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent object dtype: object
# read a dataset of Chipotle orders into a DataFrame
# (read_table defaults to a tab separator, which matches the .tsv file)
# NOTE(review): bare path ('chipotle.tsv') unlike the 'pandas/'-prefixed reads
# above — confirm working directory and unify the paths
orders = pd.read_table('chipotle.tsv')
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0 CHIPS AND FRESH TOMATO SALSA 1 IZZE 2 NANTUCKET NECTAR 3 CHIPS AND TOMATILLO-GREEN CHILI SALSA 4 CHICKEN BOWL Name: item_name, dtype: object
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
0 False 1 False 2 False 3 False 4 True Name: item_name, dtype: bool
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains('Chicken')].head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
5 | 3 | 1 | Chicken Bowl | [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... | $10.98 |
11 | 6 | 1 | Chicken Crispy Tacos | [Roasted Chili Corn Salsa, [Fajita Vegetables,... | $8.75 |
12 | 6 | 1 | Chicken Soft Tacos | [Roasted Chili Corn Salsa, [Rice, Black Beans,... | $8.75 |
13 | 7 | 1 | Chicken Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... | $11.25 |
# string methods can be chained together; pass regex=False for literal
# replacements (this also silences the FutureWarning about the changing
# default of the 'regex' parameter)
orders.choice_description.str.replace('[', '', regex=False).str.replace(']', '', regex=False).head()
/var/folders/93/795zm8c93m16_92qkk86t0_r0000gn/T/ipykernel_64526/7676934.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True. orders.choice_description.str.replace('[', '').str.replace(']', '').head()
0 NaN 1 Clementine 2 Apple 3 NaN 4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ... Name: choice_description, dtype: object
# many pandas string methods support regular expressions (regex);
# use a raw string and an explicit regex=True to strip both brackets
# in a single pass (avoids the FutureWarning about the regex default)
orders.choice_description.str.replace(r'[\[\]]', '', regex=True).head()
/var/folders/93/795zm8c93m16_92qkk86t0_r0000gn/T/ipykernel_64526/1086997848.py:2: FutureWarning: The default value of regex will change from True to False in a future version. orders.choice_description.str.replace('[\[\]]', '').head()
0 NaN 1 Clementine 2 Apple 3 NaN 4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ... Name: choice_description, dtype: object
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('pandas/ufo.csv')
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# 'Time' is currently stored as a string
ufo.dtypes
City object Colors Reported object Shape Reported object State object Time object dtype: object
# convert 'Time' to datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 1930-06-01 22:00:00 |
1 | Willingboro | NaN | OTHER | NJ | 1930-06-30 20:00:00 |
2 | Holyoke | NaN | OVAL | CO | 1931-02-15 14:00:00 |
3 | Abilene | NaN | DISK | KS | 1931-06-01 13:00:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 1933-04-18 19:00:00 |
ufo.dtypes
City object Colors Reported object Shape Reported object State object Time datetime64[ns] dtype: object
# read a dataset of alcohol consumption into a DataFrame
# NOTE(review): bare path ('drinks.csv') — depends on the working directory;
# the earlier read used 'pandas/drinks.csv'. Confirm and unify.
drinks = pd.read_csv('drinks.csv')
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index 128 country 12588 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 12332 dtype: int64
# use the 'category' data type to store the 'continent' strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes
country object beer_servings int64 spirit_servings int64 wine_servings int64 total_litres_of_pure_alcohol float64 continent category dtype: object
# 'continent' Series appears to be unchanged
drinks.continent.head()
0 Asia 1 Europe 2 Africa 3 Europe 4 Africa Name: continent, dtype: category Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks.continent.cat.codes.head()
0 1 1 2 2 0 3 2 4 0 dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Index 128 country 12588 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 756 dtype: int64
# repeat this process for the 'country' Series
drinks['country'] = drinks.country.astype('category')
drinks.memory_usage(deep=True)
Index 128 country 17142 beer_servings 1544 spirit_servings 1544 wine_servings 1544 total_litres_of_pure_alcohol 1544 continent 756 dtype: int64
# memory usage increased because we created 193 categories
drinks.country.cat.categories
train = pd.read_csv('pandas/titanic_train.csv')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
female | male | |
---|---|---|
0 | 0 | 1 |
1 | 1 | 0 |
2 | 1 | 0 |
3 | 1 | 0 |
4 | 0 | 1 |
# drop the first dummy variable ('female') using the 'iloc' method
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()
Sex_male | |
---|---|
0 | 1 |
1 | 0 |
2 | 0 |
3 | 0 |
4 | 1 |
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix='Embarked').head(10)
Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
5 | 0 | 1 | 0 |
6 | 0 | 0 | 1 |
7 | 0 | 0 | 1 |
8 | 0 | 0 | 1 |
9 | 1 | 0 | 0 |
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)
Embarked_Q | Embarked_S | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 0 |
2 | 0 | 1 |
3 | 0 | 1 |
4 | 0 | 1 |
5 | 1 | 0 |
6 | 0 | 1 |
7 | 0 | 1 |
8 | 0 | 1 |
9 | 0 | 0 |
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 1 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 0 | 1 |
# reset the DataFrame
train = pd.read_csv('pandas/titanic_train.csv')
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=['Sex', 'Embarked']).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 0 | 1 | 0 | 0 | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 1 | 0 | 1 | 0 | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 1 | 0 | 0 | 0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 1 | 0 | 0 | 0 | 1 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 0 | 1 | 0 | 0 | 1 |
# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 1 | 0 | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 0 | 0 | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 0 | 0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 0 | 0 | 1 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 1 | 0 | 1 |
# build a small demo DataFrame: four IDs with a free-text quality rating
df = pd.DataFrame({'ID': [100, 101, 102, 103],
                   'quality': ['good', 'very good', 'good', 'excellent']})
df
ID | quality | |
---|---|---|
0 | 100 | good |
1 | 101 | very good |
2 | 102 | good |
3 | 103 | excellent |
# sort the DataFrame by the 'quality' Series (alphabetical order)
df.sort_values('quality')
ID | quality | |
---|---|---|
3 | 103 | excellent |
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
from pandas.api.types import CategoricalDtype

# give 'quality' an explicit ordered categorical dtype so that sorting and
# comparison operators follow the logical order rather than the alphabetical one
cats = ['good', 'very good', 'excellent']
cat_type = CategoricalDtype(categories=cats, ordered=True)
df['quality'] = df['quality'].astype(cat_type)
df.quality
0 good 1 very good 2 good 3 excellent Name: quality, dtype: category Categories (3, object): ['good' < 'very good' < 'excellent']
# sort the DataFrame by the 'quality' Series (logical order)
df.sort_values('quality')
ID | quality | |
---|---|---|
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
3 | 103 | excellent |
# comparison operators work with ordered categories
df.loc[df.quality > 'good', :]
ID | quality | |
---|---|---|
1 | 101 | very good |
3 | 103 | excellent |
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
217 | Norridgewock | NaN | DISK | ME | 9/15/1952 14:00 |
12282 | Ipava | NaN | TRIANGLE | IL | 10/1/1998 21:15 |
17933 | Ellinwood | NaN | FIREBALL | KS | 11/13/2000 22:00 |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
test.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
5 | Valley City | NaN | DISK | ND | 9/15/1934 15:30 |
8 | Eklutna | NaN | CIGAR | AK | 10/15/1936 17:00 |
11 | Waterloo | NaN | FIREBALL | AL | 6/1/1939 20:00 |
13 | Keokuk | NaN | OVAL | IA | 7/7/1939 2:00 |
# https://stackoverflow.com/questions/44835358/pandas-list-of-dataframe-names
# collect the names of all DataFrame objects in the global namespace, then
# keep only those whose name contains 'df' (comprehensions instead of
# filter(lambda ...), which is the idiomatic spelling)
g = globals()
df_names = [name for name, obj in g.items() if isinstance(obj, pd.DataFrame)]
df_list = [name for name in df_names if 'df' in name]
df_list
Running cells with '/usr/local/bin/python3.11' requires the ipykernel package. Run the following command to install 'ipykernel' into the Python environment. Command: '/usr/local/bin/python3.11 -m pip install ipykernel -U --user --force-reinstall'