pip install pydataset
Collecting pydataset Downloading pydataset-0.2.0.tar.gz (15.9 MB) |████████████████████████████████| 15.9 MB 3.8 MB/s Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pydataset) (1.3.5) Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (1.21.6) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2022.1) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pydataset) (1.15.0) Building wheels for collected packages: pydataset Building wheel for pydataset (setup.py) ... done Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939432 sha256=a806d6fc5bf803cbd7f0adf0dadb93635522366f99f7334a9dcb209087a69360 Stored in directory: /root/.cache/pip/wheels/32/26/30/d71562a19eed948eaada9a61b4d722fa358657a3bfb5d151e2 Successfully built pydataset Installing collected packages: pydataset Successfully installed pydataset-0.2.0
# Import package
from pydataset import data
# List the available datasets (calling data() with no arguments returns the catalog of ids and titles)
print(data())
dataset_id title 0 AirPassengers Monthly Airline Passenger Numbers 1949-1960 1 BJsales Sales Data with Leading Indicator 2 BOD Biochemical Oxygen Demand 3 Formaldehyde Determination of Formaldehyde 4 HairEyeColor Hair and Eye Color of Statistics Students .. ... ... 752 VerbAgg Verbal Aggression item responses 753 cake Breakage Angle of Chocolate Cakes 754 cbpp Contagious bovine pleuropneumonia 755 grouseticks Data on red grouse ticks from Elston et al. 2001 756 sleepstudy Reaction times in a sleep deprivation study [757 rows x 2 columns]
# Create a function to glimpse the data
def glimpse(df, n=5):
    """Print the shape of a dataframe, then show its first and last rows.

    Args:
        df: Dataframe-like object exposing ``shape``, ``head`` and ``tail``.
        n: Number of rows to show from each end. Defaults to 5, which is
            what ``head()`` and ``tail()`` show when called without arguments.

    Returns:
        None. Rows are rendered with ``display`` when that name is available
        (IPython/Jupyter sessions) and with ``print`` otherwise.
    """
    try:
        # ``display`` is injected by IPython/Jupyter; it is not a Python
        # builtin, so a plain interpreter would raise NameError here.
        show = display
    except NameError:
        show = print
    print(f"{df.shape[0]} rows and {df.shape[1]} columns")
    show(df.head(n))
    show(df.tail(n))
# Load the 'cake' dataset as a dataframe and preview its first and last rows
df = data('cake')
glimpse(df)
270 rows and 5 columns
replicate | recipe | temperature | angle | temp | |
---|---|---|---|---|---|
1 | 1 | A | 175 | 42 | 175 |
2 | 1 | A | 185 | 46 | 185 |
3 | 1 | A | 195 | 47 | 195 |
4 | 1 | A | 205 | 39 | 205 |
5 | 1 | A | 215 | 53 | 215 |
replicate | recipe | temperature | angle | temp | |
---|---|---|---|---|---|
266 | 15 | C | 185 | 28 | 185 |
267 | 15 | C | 195 | 25 | 195 |
268 | 15 | C | 205 | 25 | 205 |
269 | 15 | C | 215 | 31 | 215 |
270 | 15 | C | 225 | 25 | 225 |
# Import seaborn
import seaborn as sns
# List the names of the example datasets seaborn can load
print(sns.get_dataset_names())
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']
# Load the 'flights' dataset as a dataframe and preview its first and last rows
df = sns.load_dataset('flights')
glimpse(df)
144 rows and 3 columns
year | month | passengers | |
---|---|---|---|
0 | 1949 | Jan | 112 |
1 | 1949 | Feb | 118 |
2 | 1949 | Mar | 132 |
3 | 1949 | Apr | 129 |
4 | 1949 | May | 121 |
year | month | passengers | |
---|---|---|---|
139 | 1960 | Aug | 606 |
140 | 1960 | Sep | 508 |
141 | 1960 | Oct | 461 |
142 | 1960 | Nov | 390 |
143 | 1960 | Dec | 432 |
# Import package
from sklearn.datasets import fetch_california_housing
# Load data (will download the data if it's the first time loading)
housing = fetch_california_housing(as_frame=True)
# With as_frame=True the returned bunch already carries a combined dataframe
# (feature columns followed by the target), so no manual join is needed
df = housing['frame']
glimpse(df)
20640 rows and 9 columns
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
---|---|---|---|---|---|---|---|---|---|
0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
---|---|---|---|---|---|---|---|---|---|
20635 | 1.5603 | 25.0 | 5.045455 | 1.133333 | 845.0 | 2.560606 | 39.48 | -121.09 | 0.781 |
20636 | 2.5568 | 18.0 | 6.114035 | 1.315789 | 356.0 | 3.122807 | 39.49 | -121.21 | 0.771 |
20637 | 1.7000 | 17.0 | 5.205543 | 1.120092 | 1007.0 | 2.325635 | 39.43 | -121.22 | 0.923 |
20638 | 1.8672 | 18.0 | 5.329513 | 1.171920 | 741.0 | 2.123209 | 39.43 | -121.32 | 0.847 |
20639 | 2.3886 | 16.0 | 5.254717 | 1.162264 | 1387.0 | 2.616981 | 39.37 | -121.24 | 0.894 |
# Import package
import statsmodels.api as sm
# Fetch the macrodata example and keep the dataframe it exposes as `data`
df = sm.datasets.macrodata.load_pandas().data
glimpse(df)
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
203 rows and 14 columns
year | quarter | realgdp | realcons | realinv | realgovt | realdpi | cpi | m1 | tbilrate | unemp | pop | infl | realint | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1959.0 | 1.0 | 2710.349 | 1707.4 | 286.898 | 470.045 | 1886.9 | 28.98 | 139.7 | 2.82 | 5.8 | 177.146 | 0.00 | 0.00 |
1 | 1959.0 | 2.0 | 2778.801 | 1733.7 | 310.859 | 481.301 | 1919.7 | 29.15 | 141.7 | 3.08 | 5.1 | 177.830 | 2.34 | 0.74 |
2 | 1959.0 | 3.0 | 2775.488 | 1751.8 | 289.226 | 491.260 | 1916.4 | 29.35 | 140.5 | 3.82 | 5.3 | 178.657 | 2.74 | 1.09 |
3 | 1959.0 | 4.0 | 2785.204 | 1753.7 | 299.356 | 484.052 | 1931.3 | 29.37 | 140.0 | 4.33 | 5.6 | 179.386 | 0.27 | 4.06 |
4 | 1960.0 | 1.0 | 2847.699 | 1770.5 | 331.722 | 462.199 | 1955.5 | 29.54 | 139.6 | 3.50 | 5.2 | 180.007 | 2.31 | 1.19 |
year | quarter | realgdp | realcons | realinv | realgovt | realdpi | cpi | m1 | tbilrate | unemp | pop | infl | realint | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
198 | 2008.0 | 3.0 | 13324.600 | 9267.7 | 1990.693 | 991.551 | 9838.3 | 216.889 | 1474.7 | 1.17 | 6.0 | 305.270 | -3.16 | 4.33 |
199 | 2008.0 | 4.0 | 13141.920 | 9195.3 | 1857.661 | 1007.273 | 9920.4 | 212.174 | 1576.5 | 0.12 | 6.9 | 305.952 | -8.79 | 8.91 |
200 | 2009.0 | 1.0 | 12925.410 | 9209.2 | 1558.494 | 996.287 | 9926.4 | 212.671 | 1592.8 | 0.22 | 8.1 | 306.547 | 0.94 | -0.71 |
201 | 2009.0 | 2.0 | 12901.504 | 9189.0 | 1456.678 | 1023.528 | 10077.5 | 214.469 | 1653.6 | 0.18 | 9.2 | 307.226 | 3.37 | -3.19 |
202 | 2009.0 | 3.0 | 12990.341 | 9256.0 | 1486.398 | 1044.088 | 10040.6 | 216.385 | 1673.9 | 0.12 | 9.6 | 308.013 | 3.56 | -3.44 |
pip install nltk
Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (3.2.5) Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk) (1.15.0)
# Import package
import nltk
# Download the movie_reviews corpus (only needs to be done once per environment)
nltk.download('movie_reviews')
[nltk_data] Downloading package movie_reviews to /root/nltk_data... [nltk_data] Unzipping corpora/movie_reviews.zip.
True
# Import packages
import pandas as pd
from nltk.corpus import movie_reviews
# Convert to dataframe: one row per review file. The part of each file id
# before the '/' is used as the target label, and the raw review text is
# the document.
documents = [
    (fileid.partition('/')[0], movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
]
df = pd.DataFrame(documents, columns=['target', 'document'])
glimpse(df)
2000 rows and 2 columns
target | document | |
---|---|---|
0 | neg | plot : two teen couples go to a church party , drink and then drive . \nthey get into ... |
1 | neg | the happy bastard's quick movie review \ndamn that y2k bug . \nit's got a head start i... |
2 | neg | it is movies like these that make a jaded movie viewer thankful for the invention of t... |
3 | neg | " quest for camelot " is warner bros . ' first feature-length , fully-animated attemp... |
4 | neg | synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potenti... |
target | document | |
---|---|---|
1995 | pos | wow ! what a movie . \nit's everything a movie can be : funny , dramatic , interesting... |
1996 | pos | richard gere can be a commanding actor , but he's not always in great films . \neveryt... |
1997 | pos | glory--starring matthew broderick , denzel washington , and morgan freeman--is the tru... |
1998 | pos | steven spielberg's second epic film on world war ii is an unquestioned masterpiece of ... |
1999 | pos | truman ( " true-man " ) burbank is the perfect name for jim carrey's character in this... |