import pandas as pd
import numpy as np
df = pd.read_csv("https://github.com/chris1610/pbpython/blob/master/data/sales_data_types.csv?raw=True")
df
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10002.0 | Quest Industries | $125,000.00 | $162500.00 | 30.00% | 500 | 1 | 10 | 2015 | Y |
1 | 552278.0 | Smith Plumbing | $920,000.00 | $101,2000.00 | 10.00% | 700 | 6 | 15 | 2014 | Y |
2 | 23477.0 | ACME Industrial | $50,000.00 | $62500.00 | 25.00% | 125 | 3 | 29 | 2016 | Y |
3 | 24900.0 | Brekke LTD | $350,000.00 | $490000.00 | 4.00% | 75 | 10 | 27 | 2015 | Y |
4 | 651029.0 | Harbor Co | $15,000.00 | $12750.00 | -15.00% | Closed | 2 | 2 | 2014 | N |
Use df.info and df.dtypes to look at the types that pandas automatically infers based on the data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 10 columns): Customer Number 5 non-null float64 Customer Name 5 non-null object 2016 5 non-null object 2017 5 non-null object Percent Growth 5 non-null object Jan Units 5 non-null object Month 5 non-null int64 Day 5 non-null int64 Year 5 non-null int64 Active 5 non-null object dtypes: float64(1), int64(3), object(6) memory usage: 480.0+ bytes
df.dtypes
Since the 2016 and 2017 columns were read in as objects, trying to add the values will result in string concatenation not numerical addition
df['2016'] + df['2017']
0 $125,000.00$162500.00 1 $920,000.00$101,2000.00 2 $50,000.00$62500.00 3 $350,000.00$490000.00 4 $15,000.00$12750.00 dtype: object
The simplest way to to convert to a type is using astype.
We can apply it to the customer number first.
df['Customer Number'].astype('int')
0 10002 1 552278 2 23477 3 24900 4 651029 Name: Customer Number, dtype: int64
The code above does not alter the original dataframe
df.dtypes
Customer Number float64 Customer Name object 2016 object 2017 object Percent Growth object Jan Units object Month int64 Day int64 Year int64 Active object dtype: object
Assign the new integer customer number back to the original frame and check the type
df["Customer Number"] = df['Customer Number'].astype('int')
df.dtypes
Customer Number int64 Customer Name object 2016 object 2017 object Percent Growth object Jan Units object Month int64 Day int64 Year int64 Active object dtype: object
df
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | $125,000.00 | $162500.00 | 30.00% | 500 | 1 | 10 | 2015 | Y |
1 | 552278 | Smith Plumbing | $920,000.00 | $101,2000.00 | 10.00% | 700 | 6 | 15 | 2014 | Y |
2 | 23477 | ACME Industrial | $50,000.00 | $62500.00 | 25.00% | 125 | 3 | 29 | 2016 | Y |
3 | 24900 | Brekke LTD | $350,000.00 | $490000.00 | 4.00% | 75 | 10 | 27 | 2015 | Y |
4 | 651029 | Harbor Co | $15,000.00 | $12750.00 | -15.00% | Closed | 2 | 2 | 2014 | N |
The data all looks good for the Customer Number.
If we try to convert the Jan Units column, we will get an error.
df['Jan Units'].astype('int')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-10-31333711e4a4> in <module>() ----> 1 df['Jan Units'].astype('int') ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 116 else: 117 kwargs[new_arg_name] = new_arg_value --> 118 return func(*args, **kwargs) 119 return wrapper 120 return _deprecate_kwarg ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs) 4002 # else, only a single dtype is given 4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 4004 **kwargs) 4005 return self._constructor(new_data).__finalize__(self) 4006 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs) 3460 3461 def astype(self, dtype, **kwargs): -> 3462 return self.apply('astype', dtype=dtype, **kwargs) 3463 3464 def convert(self, **kwargs): ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3327 3328 kwargs['mgr'] = self -> 3329 applied = getattr(b, f)(**kwargs) 3330 result_blocks = _extend_blocks(applied, result_blocks) 3331 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs) 542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 543 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 544 **kwargs) 545 546 def _astype(self, dtype, copy=False, errors='raise', values=None, ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs) 623 624 # _astype_nansafe works fine with 1-d only --> 625 values = astype_nansafe(values.ravel(), dtype, copy=True) 626 values = values.reshape(self.shape) 627 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy) 690 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): 691 # work around NumPy brokenness, #1987 --> 692 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) 693 694 if dtype.name in ("datetime64", "timedelta64"): pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe() pandas/_libs/src/util.pxd in util.set_value_at_unsafe() ValueError: invalid literal for int() with base 10: 'Closed'
In a similar manner we get an error if we try to convert the sales column
df['2016'].astype('float')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-11-999869d577b0> in <module>() ----> 1 df['2016'].astype('float') ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 116 else: 117 kwargs[new_arg_name] = new_arg_value --> 118 return func(*args, **kwargs) 119 return wrapper 120 return _deprecate_kwarg ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs) 4002 # else, only a single dtype is given 4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 4004 **kwargs) 4005 return self._constructor(new_data).__finalize__(self) 4006 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs) 3460 3461 def astype(self, dtype, **kwargs): -> 3462 return self.apply('astype', dtype=dtype, **kwargs) 3463 3464 def convert(self, **kwargs): ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3327 3328 kwargs['mgr'] = self -> 3329 applied = getattr(b, f)(**kwargs) 3330 result_blocks = _extend_blocks(applied, result_blocks) 3331 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs) 542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 543 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 544 **kwargs) 545 546 def _astype(self, dtype, copy=False, errors='raise', values=None, ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs) 623 624 # _astype_nansafe works fine with 1-d only --> 625 values = astype_nansafe(values.ravel(), dtype, copy=True) 626 values = values.reshape(self.shape) 627 ~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy) 701 702 if copy: --> 703 return arr.astype(dtype) 704 return arr.view(dtype) 705 ValueError: could not convert string to float: '$15,000.00'
We can try to use astype with a bool type but that does not give expected results
df['Active'].astype('bool')
0 True 1 True 2 True 3 True 4 True Name: Active, dtype: bool
# astype can take a dictionary of column names and data types
df.astype({'Customer Number': 'int', 'Customer Name': 'str'}).dtypes
Customer Number int64 Customer Name object 2016 object 2017 object Percent Growth object Jan Units object Month int64 Day int64 Year int64 Active object dtype: object
In order to convert the currency and percentages, we need to use custom functions
def convert_currency(val):
"""
Convert the string number value to a float
- Remove $
- Remove commas
- Convert to float type
"""
new_val = val.replace(',','').replace('$', '')
return float(new_val)
def convert_percent(val):
"""
Convert the percentage string to an actual floating point percent
"""
new_val = val.replace('%', '')
return float(new_val) / 100
Use apply to convert the 2016 and 2017 columns to floating point numbers
df['2016'].apply(convert_currency)
0 125000.0 1 920000.0 2 50000.0 3 350000.0 4 15000.0 Name: 2016, dtype: float64
df['2017'].apply(convert_currency)
0 162500.0 1 1012000.0 2 62500.0 3 490000.0 4 12750.0 Name: 2017, dtype: float64
We could use a lambda function as well but it may be more difficult for new users to understand
df['2016'].apply(lambda x: x.replace('$', '').replace(',', '')).astype('float')
0 125000.0 1 920000.0 2 50000.0 3 350000.0 4 15000.0 Name: 2016, dtype: float64
# Assign the converted values back to the columns
df['2016'] = df['2016'].apply(convert_currency)
df['2017'] = df['2017'].apply(convert_currency)
Use a lambda function to convert the percentage strings to numbers
df['Percent Growth'].apply(lambda x: x.replace('%', '')).astype('float') / 100
0 0.30 1 0.10 2 0.25 3 0.04 4 -0.15 Name: Percent Growth, dtype: float64
df['Percent Growth'] = df['Percent Growth'].apply(convert_percent)
df.dtypes
Customer Number int64 Customer Name object 2016 float64 2017 float64 Percent Growth float64 Jan Units object Month int64 Day int64 Year int64 Active object dtype: object
# Let's look at the data so far
df
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | 125000.0 | 162500.0 | 0.30 | 500 | 1 | 10 | 2015 | Y |
1 | 552278 | Smith Plumbing | 920000.0 | 1012000.0 | 0.10 | 700 | 6 | 15 | 2014 | Y |
2 | 23477 | ACME Industrial | 50000.0 | 62500.0 | 0.25 | 125 | 3 | 29 | 2016 | Y |
3 | 24900 | Brekke LTD | 350000.0 | 490000.0 | 0.04 | 75 | 10 | 27 | 2015 | Y |
4 | 651029 | Harbor Co | 15000.0 | 12750.0 | -0.15 | Closed | 2 | 2 | 2014 | N |
pd.to_numeric is another option for handling column conversions when invalid values are included
pd.to_numeric(df['Jan Units'], errors='coerce')
0 500.0 1 700.0 2 125.0 3 75.0 4 NaN Name: Jan Units, dtype: float64
# Fill in the NaN with 0
pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)
0 500.0 1 700.0 2 125.0 3 75.0 4 0.0 Name: Jan Units, dtype: float64
Make sure to populate the original column of data
df["Jan Units"] = pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)
pd.to_datetime is very useful for working with date conversions
pd.to_datetime(df[['Month', 'Day', 'Year']])
0 2015-01-10 1 2014-06-15 2 2016-03-29 3 2015-10-27 4 2014-02-02 dtype: datetime64[ns]
df["Start_Date"] = pd.to_datetime(df[['Month', 'Day', 'Year']])
# Check out the dataframe
df
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | Start_Date | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | 125000.0 | 162500.0 | 0.30 | 500.0 | 1 | 10 | 2015 | Y | 2015-01-10 |
1 | 552278 | Smith Plumbing | 920000.0 | 1012000.0 | 0.10 | 700.0 | 6 | 15 | 2014 | Y | 2014-06-15 |
2 | 23477 | ACME Industrial | 50000.0 | 62500.0 | 0.25 | 125.0 | 3 | 29 | 2016 | Y | 2016-03-29 |
3 | 24900 | Brekke LTD | 350000.0 | 490000.0 | 0.04 | 75.0 | 10 | 27 | 2015 | Y | 2015-10-27 |
4 | 651029 | Harbor Co | 15000.0 | 12750.0 | -0.15 | 0.0 | 2 | 2 | 2014 | N | 2014-02-02 |
Use np.where to convert the active column to a boolean
df["Active"] = np.where(df["Active"] == "Y", True, False)
df
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | Start_Date | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | 125000.0 | 162500.0 | 0.30 | 500.0 | 1 | 10 | 2015 | True | 2015-01-10 |
1 | 552278 | Smith Plumbing | 920000.0 | 1012000.0 | 0.10 | 700.0 | 6 | 15 | 2014 | True | 2014-06-15 |
2 | 23477 | ACME Industrial | 50000.0 | 62500.0 | 0.25 | 125.0 | 3 | 29 | 2016 | True | 2016-03-29 |
3 | 24900 | Brekke LTD | 350000.0 | 490000.0 | 0.04 | 75.0 | 10 | 27 | 2015 | True | 2015-10-27 |
4 | 651029 | Harbor Co | 15000.0 | 12750.0 | -0.15 | 0.0 | 2 | 2 | 2014 | False | 2014-02-02 |
df.dtypes
Customer Number int64 Customer Name object 2016 float64 2017 float64 Percent Growth float64 Jan Units float64 Month int64 Day int64 Year int64 Active bool Start_Date datetime64[ns] dtype: object
Many of the examples shown above can be used when reading in data using dtypes or converters arguments
df_2 = pd.read_csv("https://github.com/chris1610/pbpython/blob/master/data/sales_data_types.csv?raw=True",
dtype={'Customer Number':'int'},
converters={'2016':convert_currency,
'2017': convert_currency,
'Percent Growth': convert_percent,
'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
'Active': lambda x: np.where(x == "Y", True, False)
})
df_2.dtypes
Customer Number int64 Customer Name object 2016 float64 2017 float64 Percent Growth float64 Jan Units float64 Month int64 Day int64 Year int64 Active object dtype: object
df_2
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | 125000.0 | 162500.0 | 0.30 | 500.0 | 1 | 10 | 2015 | True |
1 | 552278 | Smith Plumbing | 920000.0 | 1012000.0 | 0.10 | 700.0 | 6 | 15 | 2014 | True |
2 | 23477 | ACME Industrial | 50000.0 | 62500.0 | 0.25 | 125.0 | 3 | 29 | 2016 | True |
3 | 24900 | Brekke LTD | 350000.0 | 490000.0 | 0.04 | 75.0 | 10 | 27 | 2015 | True |
4 | 651029 | Harbor Co | 15000.0 | 12750.0 | -0.15 | NaN | 2 | 2 | 2014 | False |
# This can not be applied at the time the data is read in
df_2["Start_Date"] = pd.to_datetime(df_2[['Month', 'Day', 'Year']])
df_2
Customer Number | Customer Name | 2016 | 2017 | Percent Growth | Jan Units | Month | Day | Year | Active | Start_Date | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10002 | Quest Industries | 125000.0 | 162500.0 | 0.30 | 500.0 | 1 | 10 | 2015 | True | 2015-01-10 |
1 | 552278 | Smith Plumbing | 920000.0 | 1012000.0 | 0.10 | 700.0 | 6 | 15 | 2014 | True | 2014-06-15 |
2 | 23477 | ACME Industrial | 50000.0 | 62500.0 | 0.25 | 125.0 | 3 | 29 | 2016 | True | 2016-03-29 |
3 | 24900 | Brekke LTD | 350000.0 | 490000.0 | 0.04 | 75.0 | 10 | 27 | 2015 | True | 2015-10-27 |
4 | 651029 | Harbor Co | 15000.0 | 12750.0 | -0.15 | NaN | 2 | 2 | 2014 | False | 2014-02-02 |