#!/usr/bin/env python
# coding: utf-8

# ## Iris Data

# > 📢: **This document was used during early development of siuba. See the [select docs](https://siuba.readthedocs.io/en/latest/api_table_core/03_select.html).**
#
# Many different ways of selecting columns from the iris dataset.

# In[1]:


from siuba import *

import pandas as pd

# Use the fully qualified option name. The abbreviated 'max_rows' pattern is
# resolved by regex search over all registered options and becomes ambiguous
# (OptionError) on pandas versions that also register 'styler.render.max_rows'.
pd.set_option('display.max_rows', 5)


# In[2]:


## Rather than import the iris data from sklearn, I am just including the
## first 5 rows.

# from sklearn import datasets
# iris = datasets.load_iris()
# df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# df_iris['species'] = iris.target_names[iris.target]

df_iris = pd.DataFrame({
    'sepal length (cm)': [5.1, 4.9, 4.7, 4.6, 5.0],
    'sepal width (cm)': [3.5, 3.0, 3.2, 3.1, 3.6],
    'petal length (cm)': [1.4, 1.4, 1.3, 1.5, 1.4],
    'petal width (cm)': [0.2, 0.2, 0.2, 0.2, 0.2],
    'species': ['setosa', 'setosa', 'setosa', 'setosa', 'setosa']
})


# In[3]:


# get sepal columns
select(df_iris, _.startswith("sepal"))

# get width measure columns
# note method calls sent to df_iris.columns.str
# so _.endswith("...") is equivalent to df_iris.columns.str.endswith("...")
select(df_iris, _.endswith("width (cm)"))

# move species to front
# _.endswith("") is a hack to get everything
select(df_iris, _.species, _.endswith(""))

# move sepal length to the back
# first select all variables except "sepal length (cm)", then re-select it
select(df_iris, -_["sepal length (cm)"], _["sepal length (cm)"])


# ## Wide table

# In[4]:


import numpy as np
from numpy.random import uniform, seed

# Fixed seed so the random table is reproducible between runs.
seed(123)

df = pd.DataFrame(uniform(size=[10, 10]))
# Shuffle the column order (1-based positions shifted to 0-based labels).
df = df[np.array([3, 4, 7, 1, 9, 8, 5, 2, 6, 10]) - 1]
# Prefix the integer column labels with "V" to get string names.
df.columns = "V" + df.columns.astype(str)
df


# In[5]:


select(df, _["V3":"V5"])
#select(df, _["V5":"V3"])

# no num_range capability


# In[6]:


# can exclude matches
select(df_iris, -_.startswith("petal"))


# In[7]:


# select can grab specific and rename columns
select(df_iris, _.petal_length == _["petal length (cm)"])

# rename leaves all columns
rename(df_iris, petal_length = "petal length (cm)")


# In[8]:


# Uh--- TODO? should return error? In tidyverse does group rename
# pretty rare to see!
#select(df_iris, _.obs == _.startswith('s'))