📢: This document was used during early development of siuba. See the select docs.
Many different ways of selecting columns from the iris dataset.
from siuba import *
import pandas as pd
# Show at most 5 rows when a DataFrame is displayed.
# The fully qualified key is required: option names are matched as patterns,
# and a bare "max_rows" also matches "styler.render.max_rows" in
# pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 5)
## Only the first 5 rows of iris are hard-coded below, which avoids importing
## the data from sklearn. The full dataset could be built like this:
# from sklearn import datasets
# iris = datasets.load_iris()
# df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# df_iris['species'] = iris.target_names[iris.target]
df_iris = pd.DataFrame(
    {
        'sepal length (cm)': [5.1, 4.9, 4.7, 4.6, 5.0],
        'sepal width (cm)': [3.5, 3.0, 3.2, 3.1, 3.6],
        'petal length (cm)': [1.4, 1.4, 1.3, 1.5, 1.4],
        'petal width (cm)': [0.2] * 5,
        # every one of the five sample rows is a setosa
        'species': ['setosa'] * 5,
    }
)
# get the sepal columns
select(df_iris, _.startswith("sepal"))
# get the width measurement columns
# note: method calls on _ are sent to df_iris.columns.str,
# so _.endswith("...") is equivalent to df_iris.columns.str.endswith("...")
select(df_iris, _.endswith("width (cm)"))
# move species to the front
# _.endswith("") is a hack to match every column
select(df_iris, _.species, _.endswith(""))
# move sepal length to the back:
# first select every column except "sepal length (cm)", then re-select it
select(df_iris, -_["sepal length (cm)"], _["sepal length (cm)"])
| | sepal width (cm) | petal length (cm) | petal width (cm) | species | sepal length (cm) |
---|---|---|---|---|---|
0 | 3.5 | 1.4 | 0.2 | setosa | 5.1 |
1 | 3.0 | 1.4 | 0.2 | setosa | 4.9 |
2 | 3.2 | 1.3 | 0.2 | setosa | 4.7 |
3 | 3.1 | 1.5 | 0.2 | setosa | 4.6 |
4 | 3.6 | 1.4 | 0.2 | setosa | 5.0 |
import numpy as np
from numpy.random import uniform, seed
# Reproducible 10x10 frame of uniform random values. The columns are put in a
# scrambled order (1-based positions shifted to 0-based labels) and each
# label is prefixed with "V".
seed(123)
column_order = np.array([3, 4, 7, 1, 9, 8, 5, 2, 6, 10]) - 1
df = pd.DataFrame(uniform(size = [10, 10]))[column_order].add_prefix("V")
df
| | V2 | V3 | V6 | V0 | V8 | V7 | V4 | V1 | V5 | V9 |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.226851 | 0.551315 | 0.980764 | 0.696469 | 0.480932 | 0.684830 | 0.719469 | 0.286139 | 0.423106 | 0.392118 |
1 | 0.438572 | 0.059678 | 0.182492 | 0.343178 | 0.531551 | 0.175452 | 0.398044 | 0.729050 | 0.737995 | 0.531828 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8 | 0.554383 | 0.388951 | 0.357398 | 0.318766 | 0.304768 | 0.043591 | 0.925132 | 0.691970 | 0.841670 | 0.398186 |
9 | 0.355915 | 0.762548 | 0.151127 | 0.704959 | 0.240856 | 0.398876 | 0.593177 | 0.995358 | 0.691702 | 0.343456 |
10 rows × 10 columns
# slice from column "V3" through column "V5", both ends included; the range
# follows the order of the columns in df, not the number in each name
select(df, _["V3":"V5"])
# NOTE(review): the reversed slice is left disabled; its result is not shown here
#select(df, _["V5":"V3"])
# no num_range capability
| | V3 | V6 | V0 | V8 | V7 | V4 | V1 | V5 |
---|---|---|---|---|---|---|---|---|
0 | 0.551315 | 0.980764 | 0.696469 | 0.480932 | 0.684830 | 0.719469 | 0.286139 | 0.423106 |
1 | 0.059678 | 0.182492 | 0.343178 | 0.531551 | 0.175452 | 0.398044 | 0.729050 | 0.737995 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
8 | 0.388951 | 0.357398 | 0.318766 | 0.304768 | 0.043591 | 0.925132 | 0.691970 | 0.841670 |
9 | 0.762548 | 0.151127 | 0.704959 | 0.240856 | 0.398876 | 0.593177 | 0.995358 | 0.691702 |
10 rows × 8 columns
# matches can be excluded: a unary minus in front of the match drops
# the matching columns and keeps the rest
select(df_iris, -_.startswith("petal"))
| | sepal length (cm) | sepal width (cm) | species |
---|---|---|---|
0 | 5.1 | 3.5 | setosa |
1 | 4.9 | 3.0 | setosa |
2 | 4.7 | 3.2 | setosa |
3 | 4.6 | 3.1 | setosa |
4 | 5.0 | 3.6 | setosa |
# select can grab a specific column and rename it in the same step,
# written as _.new_name == _["old name"]
select(df_iris, _.petal_length == _["petal length (cm)"])
# rename keeps all columns and only changes the given name
rename(df_iris, petal_length = "petal length (cm)")
| | sepal length (cm) | sepal width (cm) | petal_length | petal width (cm) | species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# TODO: should renaming with a match on several columns raise an error?
# The tidyverse does a group rename in this case, but it is pretty rare to see!
#select(df_iris, _.obs == _.startswith('s'))