Iris Data¶

📢: This document was used during early development of siuba. See the select docs.

This notebook shows many different ways of selecting columns from the iris dataset.

In [1]:

from siuba import *
import pandas as pd

# Keep rendered frames short: pandas truncates the display beyond 5 rows.
# The fully qualified key is used because option names are matched as patterns,
# and the bare 'max_rows' also matches 'styler.render.max_rows' on pandas
# versions that define it, which raises OptionError.
pd.set_option('display.max_rows', 5)

In [2]:

## The iris data normally comes from sklearn; to avoid that import, only the
## first 5 rows are written out by hand below.
##
## The full frame could be built like this:
#
# from sklearn import datasets
# iris = datasets.load_iris()
# df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# df_iris['species'] = iris.target_names[iris.target]

# One tuple per flower, in the same order as the column labels.
df_iris = pd.DataFrame(
    [
        (5.1, 3.5, 1.4, 0.2, 'setosa'),
        (4.9, 3.0, 1.4, 0.2, 'setosa'),
        (4.7, 3.2, 1.3, 0.2, 'setosa'),
        (4.6, 3.1, 1.5, 0.2, 'setosa'),
        (5.0, 3.6, 1.4, 0.2, 'setosa'),
    ],
    columns=[
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
        'species',
    ],
)

In [3]:

# NOTE: this cell evaluates four select() calls, but only the value of the
# last expression is rendered as the cell output.

# get sepal columns
select(df_iris, _.startswith("sepal"))

# get width measure columns
# note: method calls on _ are sent to df_iris.columns.str,
# so _.endswith("...") is equivalent to df_iris.columns.str.endswith("...")
select(df_iris, _.endswith("width (cm)"))

# move species to the front
# _.endswith("") is a hack to match every column
select(df_iris, _.species, _.endswith(""))

# move sepal length to the back:
# first select every column except "sepal length (cm)", then re-select it
select(df_iris, -_["sepal length (cm)"], _["sepal length (cm)"])

Out[3]:

	sepal width (cm)	petal length (cm)	petal width (cm)	species	sepal length (cm)
0	3.5	1.4	0.2	setosa	5.1
1	3.0	1.4	0.2	setosa	4.9
2	3.2	1.3	0.2	setosa	4.7
3	3.1	1.5	0.2	setosa	4.6
4	3.6	1.4	0.2	setosa	5.0

Wide table¶

In [4]:

import numpy as np
from numpy.random import uniform, seed

# Fix the legacy global RNG so the random table below is reproducible.
seed(123)

# Build a 10 x 10 frame of uniform draws, shuffle its columns into a fixed
# order (written 1-based, hence the "- 1"), then label each column "V<n>"
# using the zero-based position it originally had.
df = (
    pd.DataFrame(uniform(size=(10, 10)))
    .loc[:, np.array([3, 4, 7, 1, 9, 8, 5, 2, 6, 10]) - 1]
    .rename(columns=lambda position: "V" + str(position))
)
df

Out[4]:

	V2	V3	V6	V0	V8	V7	V4	V1	V5	V9
0	0.226851	0.551315	0.980764	0.696469	0.480932	0.684830	0.719469	0.286139	0.423106	0.392118
1	0.438572	0.059678	0.182492	0.343178	0.531551	0.175452	0.398044	0.729050	0.737995	0.531828
...	...	...	...	...	...	...	...	...	...	...
8	0.554383	0.388951	0.357398	0.318766	0.304768	0.043591	0.925132	0.691970	0.841670	0.398186
9	0.355915	0.762548	0.151127	0.704959	0.240856	0.398876	0.593177	0.995358	0.691702	0.343456

10 rows × 10 columns

In [5]:

# slice by column name: keeps the columns positioned from "V3" through "V5"
# (both ends included), following the frame's current column order
select(df, _["V3":"V5"])
# reversed bounds, left disabled
#select(df, _["V5":"V3"])

# no num_range capability (no counterpart to the tidyverse num_range() helper)

Out[5]:

	V3	V6	V0	V8	V7	V4	V1	V5
0	0.551315	0.980764	0.696469	0.480932	0.684830	0.719469	0.286139	0.423106
1	0.059678	0.182492	0.343178	0.531551	0.175452	0.398044	0.729050	0.737995
...	...	...	...	...	...	...	...	...
8	0.388951	0.357398	0.318766	0.304768	0.043591	0.925132	0.691970	0.841670
9	0.762548	0.151127	0.704959	0.240856	0.398876	0.593177	0.995358	0.691702

10 rows × 8 columns

In [6]:

# a leading minus excludes matches: drop every column whose name starts with "petal"
select(df_iris, -_.startswith("petal"))

Out[6]:

	sepal length (cm)	sepal width (cm)	species
0	5.1	3.5	setosa
1	4.9	3.0	setosa
2	4.7	3.2	setosa
3	4.6	3.1	setosa
4	5.0	3.6	setosa

In [7]:

# select can grab a specific column and rename it in one step:
# the new name goes on the left of ==, the existing column on the right
select(df_iris, _.petal_length == _["petal length (cm)"])

# rename changes the name but leaves all columns in place
# (this last expression is the one rendered as the cell output)
rename(df_iris, petal_length = "petal length (cm)")

Out[7]:

	sepal length (cm)	sepal width (cm)	petal_length	petal width (cm)	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

In [8]:

# TODO: should this raise an error? In the tidyverse this does a group rename,
# but it is pretty rare to see in practice.
#select(df_iris, _.obs == _.startswith('s'))