import pandas as pd
import os
# Show the installed pandas version. The call form of print is used because
# it is valid on both Python 2 and Python 3 for a single argument.
print(pd.__version__)
0.20.3
# Load the pipe-delimited movie-users table. The file has no header row, so
# the column names are supplied here and user_id becomes the row index.
movie_users_url = "http://bit.ly/movieusers"
user_columns = ["user_id", "age", "gender", "occupation", "zipcode"]
df = pd.read_table(
    movie_users_url,
    sep="|",
    header=None,
    names=user_columns,
    index_col="user_id",
)
# Preview the first five rows to check the column names and the user_id index.
df.head(5)
age | gender | occupation | zipcode | |
---|---|---|---|---|
user_id | ||||
1 | 24 | M | technician | 85711 |
2 | 53 | F | other | 94043 |
3 | 23 | M | writer | 32067 |
4 | 24 | M | technician | 43537 |
5 | 33 | F | other | 15213 |
# (rows, columns) of the user table.
df.shape
(943, 4)
# Number of rows that repeat an earlier row. With the default keep="first",
# the first occurrence of each row is not counted as a duplicate.
df.duplicated().sum()
7
# Select the rows flagged as duplicates (each one repeats an earlier row).
df.loc[df.duplicated()]
age | gender | occupation | zipcode | |
---|---|---|---|---|
user_id | ||||
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# Note that the default call above lists only the later repeats, not the first
# occurrence of each duplicated row.
# keep=False flags every occurrence, so the first rows and their repeats all show up.
df.loc[df.duplicated(keep=False)]
age | gender | occupation | zipcode | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# We started with (943, 4); after dropping duplicated rows it is (936, 4).
# The counts are computed from the frame instead of being hard-coded, and the
# de-duplicated result is actually used rather than silently discarded.
total_rows = df.shape[0]
unique_rows = df.drop_duplicates().shape[0]
# print("") emits a blank line on both Python 2 and 3 (a bare print() would
# output "()" on Python 2).
print("")
print("{0}-{1} = {2}".format(total_rows, total_rows - unique_rows, unique_rows))
943-7 = 936
# Replace df with the UFO reports data set, read as a comma-separated file.
ufo_reports_url = "http://bit.ly/uforeports"
df = pd.read_csv(ufo_reports_url)
# Preview the first five UFO report rows.
df.head(5)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# Columns with dtype `object` hold generic Python objects, which in practice
# usually means strings. Time is still `object` here because it has not been
# parsed into datetimes yet.
df.dtypes
City               object
Colors Reported    object
Shape Reported     object
State              object
Time               object
dtype: object
# Parse the Time strings into datetime values so the .dt accessor can be used.
parsed_times = pd.to_datetime(df["Time"])
df["Time"] = parsed_times
# Preview again: Time now shows parsed timestamps instead of the raw strings.
df.head(5)
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 1930-06-01 22:00:00 |
1 | Willingboro | NaN | OTHER | NJ | 1930-06-30 20:00:00 |
2 | Holyoke | NaN | OVAL | CO | 1931-02-15 14:00:00 |
3 | Abilene | NaN | DISK | KS | 1931-06-01 13:00:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 1933-04-18 19:00:00 |
# Confirm the conversion: Time should now be datetime64[ns].
df.dtypes
City                       object
Colors Reported            object
Shape Reported             object
State                      object
Time               datetime64[ns]
dtype: object
# Calendar date of each timestamp via the .dt accessor (first two rows).
df.Time.dt.date.head(2)
0    1930-06-01
1    1930-06-30
Name: Time, dtype: object
# Month number of each timestamp via the .dt accessor (first two rows).
df.Time.dt.month.head(2)
0    6
1    6
Name: Time, dtype: int64