In [7]:

pip install pydataset

Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
     |████████████████████████████████| 15.9 MB 3.8 MB/s 
Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pydataset) (1.3.5)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (1.21.6)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2022.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pydataset) (1.15.0)
Building wheels for collected packages: pydataset
  Building wheel for pydataset (setup.py) ... done
  Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939432 sha256=a806d6fc5bf803cbd7f0adf0dadb93635522366f99f7334a9dcb209087a69360
  Stored in directory: /root/.cache/pip/wheels/32/26/30/d71562a19eed948eaada9a61b4d722fa358657a3bfb5d151e2
Successfully built pydataset
Installing collected packages: pydataset
Successfully installed pydataset-0.2.0

In [25]:

# Import the dataset accessor from pydataset
from pydataset import data
# Calling data() with no arguments returns the catalogue of available
# datasets (a table with a dataset_id column and a title column); print it
print(data())

        dataset_id                                             title
0    AirPassengers       Monthly Airline Passenger Numbers 1949-1960
1          BJsales                 Sales Data with Leading Indicator
2              BOD                         Biochemical Oxygen Demand
3     Formaldehyde                     Determination of Formaldehyde
4     HairEyeColor         Hair and Eye Color of Statistics Students
..             ...                                               ...
752        VerbAgg                  Verbal Aggression item responses
753           cake                 Breakage Angle of Chocolate Cakes
754           cbpp                 Contagious bovine pleuropneumonia
755    grouseticks  Data on red grouse ticks from Elston et al. 2001
756     sleepstudy       Reaction times in a sleep deprivation study

[757 rows x 2 columns]

In [9]:

# Create a function to glimpse the data
def glimpse(df, n=5):
    """Print the dimensions of ``df`` and display its first and last rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Must expose a two-entry ``shape`` plus
        ``head`` and ``tail`` methods.
    n : int, default 5
        Number of rows to show from each end. The default matches what
        ``head()`` / ``tail()`` return when called without arguments.

    Returns
    -------
    None
        The summary line is printed and the two previews are rendered
        through ``display``.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f"{n_rows} rows and {n_cols} columns")
    # `display` is supplied by the IPython/Jupyter session; it renders the
    # rich table representation instead of plain text.
    display(df.head(n))
    display(df.tail(n))

In [12]:

# Load pydataset's 'cake' dataset as a dataframe, then preview its size
# and its first/last rows with glimpse()
df = data('cake')
glimpse(df)

270 rows and 5 columns

	replicate	recipe	temperature	angle	temp
1	1	A	175	42	175
2	1	A	185	46	185
3	1	A	195	47	195
4	1	A	205	39	205
5	1	A	215	53	215

	replicate	recipe	temperature	angle	temp
266	15	C	185	28	185
267	15	C	195	25	195
268	15	C	205	25	205
269	15	C	215	31	215
270	15	C	225	25	225

In [13]:

# Import seaborn
import seaborn as sns
# Print the names of seaborn's example datasets; any of these names can be
# passed to sns.load_dataset()
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']

In [17]:

# Load seaborn's 'flights' example dataset as a dataframe and preview it
df = sns.load_dataset('flights')
glimpse(df)

144 rows and 3 columns

	year	month	passengers
0	1949	Jan	112
1	1949	Feb	118
2	1949	Mar	132
3	1949	Apr	129
4	1949	May	121

	year	month	passengers
139	1960	Aug	606
140	1960	Sep	508
141	1960	Oct	461
142	1960	Nov	390
143	1960	Dec	432

In [18]:

# Import package
from sklearn.datasets import fetch_california_housing
# Load data (will download the data if it's the first time loading)
housing = fetch_california_housing(as_frame=True)
# With as_frame=True the returned bundle already carries `frame`: the feature
# columns followed by the target column in a single dataframe, so no manual
# join of `data` and `target` is needed
df = housing.frame
glimpse(df)

20640 rows and 9 columns

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude	MedHouseVal
0	8.3252	41.0	6.984127	1.023810	322.0	2.555556	37.88	-122.23	4.526
1	8.3014	21.0	6.238137	0.971880	2401.0	2.109842	37.86	-122.22	3.585
2	7.2574	52.0	8.288136	1.073446	496.0	2.802260	37.85	-122.24	3.521
3	5.6431	52.0	5.817352	1.073059	558.0	2.547945	37.85	-122.25	3.413
4	3.8462	52.0	6.281853	1.081081	565.0	2.181467	37.85	-122.25	3.422

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude	MedHouseVal
20635	1.5603	25.0	5.045455	1.133333	845.0	2.560606	39.48	-121.09	0.781
20636	2.5568	18.0	6.114035	1.315789	356.0	3.122807	39.49	-121.21	0.771
20637	1.7000	17.0	5.205543	1.120092	1007.0	2.325635	39.43	-121.22	0.923
20638	1.8672	18.0	5.329513	1.171920	741.0	2.123209	39.43	-121.32	0.847
20639	2.3886	16.0	5.254717	1.162264	1387.0	2.616981	39.37	-121.24	0.894

In [19]:

# Bring in the statsmodels API namespace
import statsmodels.api as sm
# load_pandas() hands back a dataset bundle; its `data` entry is the full
# dataframe, reached here by attribute access
df = sm.datasets.macrodata.load_pandas().data
glimpse(df)

/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

203 rows and 14 columns

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
0	1959.0	1.0	2710.349	1707.4	286.898	470.045	1886.9	28.98	139.7	2.82	5.8	177.146	0.00	0.00
1	1959.0	2.0	2778.801	1733.7	310.859	481.301	1919.7	29.15	141.7	3.08	5.1	177.830	2.34	0.74
2	1959.0	3.0	2775.488	1751.8	289.226	491.260	1916.4	29.35	140.5	3.82	5.3	178.657	2.74	1.09
3	1959.0	4.0	2785.204	1753.7	299.356	484.052	1931.3	29.37	140.0	4.33	5.6	179.386	0.27	4.06
4	1960.0	1.0	2847.699	1770.5	331.722	462.199	1955.5	29.54	139.6	3.50	5.2	180.007	2.31	1.19

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
198	2008.0	3.0	13324.600	9267.7	1990.693	991.551	9838.3	216.889	1474.7	1.17	6.0	305.270	-3.16	4.33
199	2008.0	4.0	13141.920	9195.3	1857.661	1007.273	9920.4	212.174	1576.5	0.12	6.9	305.952	-8.79	8.91
200	2009.0	1.0	12925.410	9209.2	1558.494	996.287	9926.4	212.671	1592.8	0.22	8.1	306.547	0.94	-0.71
201	2009.0	2.0	12901.504	9189.0	1456.678	1023.528	10077.5	214.469	1653.6	0.18	9.2	307.226	3.37	-3.19
202	2009.0	3.0	12990.341	9256.0	1486.398	1044.088	10040.6	216.385	1673.9	0.12	9.6	308.013	3.56	-3.44

In [32]:

pip install nltk

Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (3.2.5)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk) (1.15.0)

In [33]:

# Import package
import nltk
# Download the movie_reviews corpus (only needs to be done once per
# environment). As the last expression in the cell, the call's return value
# is echoed as the cell output.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.

Out[33]:

True

In [34]:

# Import packages
import pandas as pd
from nltk.corpus import movie_reviews
# Convert to dataframe: one (label, raw text) pair per corpus file.
# Each fileid has the form '<label>/<file name>', so the text before the
# first slash is the label; partition() avoids binding an unused file-name
# variable and does not fail if a fileid contains more than one slash.
documents = [
    (fileid.partition('/')[0], movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
]
df = pd.DataFrame(documents, columns=['target', 'document'])
glimpse(df)

2000 rows and 2 columns

	target	document
0	neg	plot : two teen couples go to a church party , drink and then drive . \nthey get into ...
1	neg	the happy bastard's quick movie review \ndamn that y2k bug . \nit's got a head start i...
2	neg	it is movies like these that make a jaded movie viewer thankful for the invention of t...
3	neg	" quest for camelot " is warner bros . ' first feature-length , fully-animated attemp...
4	neg	synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potenti...

	target	document
1995	pos	wow ! what a movie . \nit's everything a movie can be : funny , dramatic , interesting...
1996	pos	richard gere can be a commanding actor , but he's not always in great films . \neveryt...
1997	pos	glory--starring matthew broderick , denzel washington , and morgan freeman--is the tru...
1998	pos	steven spielberg's second epic film on world war ii is an unquestioned masterpiece of ...
1999	pos	truman ( " true-man " ) burbank is the perfect name for jim carrey's character in this...

	replicate	recipe	temperature	angle	temp
266	15	C	185	28	185
267	15	C	195	25	195
268	15	C	205	25	205
269	15	C	215	31	215
270	15	C	225	25	225

	replicate	recipe	temperature	angle	temp
266	15	C	185	28	185
267	15	C	195	25	195
268	15	C	205	25	205
269	15	C	215	31	215
270	15	C	225	25	225

	replicate	recipe	temperature	angle	temp
266	15	C	185	28	185
267	15	C	195	25	195
268	15	C	205	25	205
269	15	C	215	31	215
270	15	C	225	25	225