#!/usr/bin/env python
# coding: utf-8
#
#
# # Explore and download books from the Gutenberg Books collection
#
# This Jupyter Notebook provides an interactive exploration and downloading interface for the Gutenberg Books Collection.
#
# Explore the vast collection of books, analyze metadata, and download selected texts based on various criteria. Dive into literary exploration and access timeless classics with ease.
#
# **Note:** you can execute the whole "Preliminaries" section while it is collapsed by clicking on the "run" icon. Once all the cells in the "Preliminaries" section have been executed, all other cells can be executed independently of one another.
#
# ![preliminaries_collapse.png]()
# # Preliminaries
# ## Main class `GutenbergBooks`
# In[1]:
import gzip
import urllib.request
import requests
import os
import io
import pandas as pd
# URL of the gzip-compressed CSV catalog that GutenbergBooks downloads.
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz"
# Name of the local directory in which downloaded book files are cached.
GUTENBERG_CACHEDIR = "GutenbergBooks"
# Enable pandas' copy-on-write mode for the whole session.
pd.options.mode.copy_on_write = True
class GutenbergBooks:
    """Explore the Project Gutenberg catalog and download books from it.

    The catalog is loaded into a pandas DataFrame, either from a local CSV
    copy or from ``GUTENBERG_URL``. Downloaded books are stored as
    ``pg<id>.txt`` files inside the cache directory.

    Attributes:
        catalog_url: URL of the gzip-compressed catalog.
        catalog_file: Local file name of the decompressed catalog copy.
        is_cached: True once a local catalog copy is known to exist.
        catalog: The catalog as a DataFrame.
        all_subjects: Distinct subjects found in the catalog.
        cachedir: Directory holding the downloaded book files.
    """

    def __init__(self):
        """Load the catalog and make sure the book cache directory exists.

        Raises:
            RuntimeError: If the catalog can neither be read nor downloaded.
        """
        self.catalog_url = GUTENBERG_URL
        # File name of the URL without its trailing ".gz".
        self.catalog_file = self.catalog_url.rsplit('/', 1)[-1][:-3]
        # NOTE: this instance attribute shadows the is_cached() method below;
        # both are kept so that existing callers keep working.
        self.is_cached = os.path.isfile(self.catalog_file)
        self.cachedir = GUTENBERG_CACHEDIR
        os.makedirs(self.cachedir, exist_ok=True)
        self.catalog = self.fetch_catalog()
        if self.catalog is None:
            # fetch_catalog() has already printed the underlying error.
            raise RuntimeError(
                f"Could not load the Gutenberg catalog from {self.catalog_url}"
            )
        self.all_subjects = self.get_subjects()

    def is_cached(self):
        """Return True if the local catalog copy exists on disk."""
        return os.path.isfile(self.catalog_file)

    def cache_catalog(self):
        """Download the catalog again and refresh the local copy.

        If the download fails the previously loaded catalog is kept.
        """
        fresh = self.fetch_catalog(use_cache=False)
        if fresh is None:
            return
        # fetch_catalog() has already written the file and set the flag.
        self.catalog = fresh
        self.all_subjects = self.get_subjects()
        self.is_cached = True

    def is_book_downloaded(self, bookID):
        """Return True if ``pg<bookID>.txt`` exists in the cache directory."""
        return os.path.isfile(os.path.join(self.cachedir, f"pg{bookID}.txt"))

    def fetch_catalog(self, use_cache=True):
        """Return the catalog as a DataFrame.

        Parameters:
        - use_cache (bool): read the local copy when it exists instead of
          downloading the catalog.

        Returns:
        - DataFrame: the catalog, or None if the download failed (the error
          is printed).
        """
        url = self.catalog_url
        filename = self.catalog_file
        if use_cache and os.path.isfile(filename):
            print(f"Retrieving {filename} from cache. To refresh cache use cache_catalog()")
            dataframe = pd.read_csv(filename, quotechar='"')
            self.is_cached = True
            # Copies written by earlier versions include the DataFrame index
            # as an extra unnamed first column; drop it so that cached and
            # freshly downloaded catalogs have the same columns.
            return dataframe.drop(columns=["Unnamed: 0"], errors="ignore")
        try:
            print(f"Retrieving {filename} from {url}.")
            # The context manager closes the HTTP response deterministically.
            with urllib.request.urlopen(url) as response:
                compressed_data = response.read()
            decompressed_data = gzip.decompress(compressed_data)
            dataframe = pd.read_csv(
                io.BytesIO(decompressed_data), quotechar='"', encoding='utf-8'
            )
            dataframe.to_csv(filename, index=False)
            self.is_cached = True
            return dataframe
        except Exception as e:
            # Deliberately best-effort: report the problem and let the
            # caller decide what to do with a missing catalog.
            print("An error occurred:", e)
            return None

    def _distinct(self, column):
        """Return the distinct '; '-separated values of a catalog column."""
        return self.catalog[column].str.split('; ').explode().unique().tolist()

    def _split_counts(self, column, label=None):
        """Count the '; '-separated values of a catalog column.

        Returns a DataFrame with the value column (renamed to ``label`` when
        given) and a 'Count' column, sorted by descending count.
        """
        values = self.catalog[column].str.split('; ').explode()
        counts = (
            values.groupby(values).count()
            .reset_index(name='Count')
            .sort_values(by='Count', ascending=False)
        )
        if label is not None:
            counts = counts.rename(columns={column: label})
        return counts.reset_index(drop=True)

    def get_subjects(self):
        """Return the list of distinct subjects."""
        return self._distinct('Subjects')

    def random_subjects(self, n, seed):
        """
        This method returns n random subjects.
        Parameters:
        - n (int): number of subjects.
        - seed (int): random seed for reproducibility.
        Returns:
        - DataFrame: random sample of subjects (columns 'Subject' and
          'Count') drawn without replacement, weighted by subject frequency.
        """
        subject_counts = self._split_counts('Subjects', 'Subject')
        return subject_counts.sample(n=n, replace=False, random_state=seed,
                                     weights=subject_counts['Count'])

    def topn_subjects(self, n):
        """Return the n most frequent subjects with their counts."""
        return self._split_counts('Subjects', 'Subject').head(n)

    def get_authors(self):
        """Return the list of distinct authors."""
        return self._distinct('Authors')

    def random_authors(self, n, seed):
        """Return n authors sampled without replacement, weighted by count.

        Parameters:
        - n (int): number of authors.
        - seed (int): random seed for reproducibility.
        """
        author_counts = self._split_counts('Authors', 'Author')
        return author_counts.sample(n=n, replace=False, random_state=seed,
                                    weights=author_counts['Count'])

    def topn_authors(self, n):
        """Return the n most frequent authors with their counts."""
        return self._split_counts('Authors', 'Author').head(n)

    def get_languages(self):
        """Return the list of distinct languages."""
        return self._distinct('Language')

    def topn_languages(self, n):
        """Return the n most frequent languages with their counts."""
        return self._split_counts('Language').head(n)

    def get_bookshelves(self):
        """Return the list of distinct bookshelves."""
        return self._distinct('Bookshelves')

    def topn_bookshelves(self, n):
        """Return the n most frequent bookshelves with their counts."""
        return self._split_counts('Bookshelves', 'Bookshelf').head(n)

    def get_types(self):
        """Return the list of distinct values of the 'Type' column."""
        return self.catalog['Type'].unique().tolist()

    def get_books(self, lang=None, subject=None, title=None):
        """Return the catalog rows matching all the given criteria.

        Parameters:
        - lang (str, optional): language code that must be one of the
          '; '-separated entries of 'Language' (case-insensitive).
        - subject (str, optional): pattern searched in 'Subjects'
          (case-insensitive regular expression).
        - title (str, optional): pattern searched in 'Title'
          (case-insensitive regular expression).
        A criterion left as None is not applied.

        Returns:
        - DataFrame: the matching rows of the catalog.
        """
        mask = pd.Series(True, index=self.catalog.index)
        if lang is not None:
            wanted = lang.lower()
            mask &= self.catalog['Language'].map(
                lambda v: isinstance(v, str) and wanted in v.lower().split('; ')
            ).astype(bool)
        if subject is not None:
            mask &= self.catalog['Subjects'].str.contains(subject, case=False, na=False)
        if title is not None:
            mask &= self.catalog['Title'].str.contains(title, case=False, na=False)
        return self.catalog[mask]

    def random_books(self, n, seed):
        """Return n catalog rows sampled without replacement."""
        return self.catalog.sample(n=n, replace=False, random_state=seed)

    def books_matching_subject(self, substr):
        """Return the books whose 'Subjects' match ``substr``.

        ``substr`` is a case-insensitive regular expression, so literal
        parentheses and similar characters must be escaped by the caller.
        """
        # Boolean indexing instead of a string-built query(): the pattern is
        # passed as data, so quotes in it cannot break the expression.
        return self.catalog[self.catalog['Subjects'].str.contains(substr, case=False, na=False)]

    def books_matching_author(self, substr):
        """Return the books whose 'Authors' match ``substr``.

        ``substr`` is a case-insensitive regular expression.
        """
        return self.catalog[self.catalog['Authors'].str.contains(substr, case=False, na=False)]

    def books_matching_year(self, given_year):
        """
        Find books from the catalog that match a given year within the birth-death intervals of authors.
        Parameters:
        - given_year (int): The year to match within the birth-death intervals of authors.
        Returns:
        - DataFrame: one row per (book, matching author interval), with the
          catalog columns plus 'match', 'Birth_Year' and 'Death_Year'. A book
          with several matching authors therefore appears several times.
        This method extracts "YYYY-YYYY" intervals from the 'Authors' column of the catalog and keeps
        the rows where the given year lies within the interval (bounds included).
        """
        # One row per interval found; the index levels are named explicitly
        # so that the merge key exists whatever the catalog index is called.
        intervals = (
            self.catalog['Authors']
            .str.extractall(r'((?:\w+\s+)?(?:\d{4})\s*-\s*(?:\d{4}))')
            .rename_axis(['level_0', 'match'])
            .reset_index()
            .rename(columns={0: 'Author_Interval'})
        )
        merged_df = pd.merge(self.catalog, intervals, left_index=True, right_on='level_0')
        merged_df['Birth_Year'] = pd.to_numeric(
            merged_df['Author_Interval'].str.extract(r'(\d{4})', expand=False),
            errors='coerce')
        merged_df['Death_Year'] = pd.to_numeric(
            merged_df['Author_Interval'].str.extract(r'\d{4}\s*-\s*(\d{4})', expand=False),
            errors='coerce')
        matching_books = merged_df[(merged_df['Birth_Year'] <= given_year) &
                                   (merged_df['Death_Year'] >= given_year)]
        # Non in-place drop: never mutates a filtered view of merged_df.
        return matching_books.drop(columns=['Author_Interval', 'level_0'])

    def download_book(self, nr):
        """
        Download one book from the Gutenberg collection identified by its id.
        If the book already exists in the cache folder, it is not downloaded again.
        Parameters:
        - nr (int): id of the book in the Gutenberg books collection.
        Returns:
        - str: the path where the book was downloaded, or None if the
          download failed (the error is printed).
        """
        b = str(nr)
        book = f"pg{b}.txt"
        url = f"https://www.gutenberg.org/cache/epub/{b}/{book}"
        book_path = os.path.join(self.cachedir, book)
        if self.is_book_downloaded(b):
            print(f"Book {nr} already exists in cache. Not downloading.")
            return book_path
        try:
            print(f"Retrieving {book} from {url}.")
            # Fetch and validate before touching the disk, so that a failed
            # request never leaves an empty or error-page file behind that
            # would later be mistaken for a cached book.
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            os.makedirs(self.cachedir, exist_ok=True)
            # Write the raw bytes: no dependence on the platform's default
            # text encoding.
            with open(book_path, "wb") as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print("An error occurred:", e)
            return None
        return book_path

    def download_books(self, books):
        """
        Download a list of books from the Gutenberg collection.
        If a book already exists in the cache folder, it is not downloaded again.
        Parameters:
        - books (list): list of ids of books in the Gutenberg books collection.
        Returns:
        - list: one entry per requested book, the path of the book or None
          if its download failed.
        """
        return [self.download_book(b) for b in books]

    def download_n_books(self, n, subject):
        """
        Download a certain number of books from the Gutenberg collection based on the desired size and subject.
        If a book already exists in the cache folder, it is not downloaded again.
        Parameters:
        - n (int): The number of books to download.
        - subject (str): The subject to match when selecting books.
        Returns:
        - list: A list of paths where the downloaded books are saved.
        """
        books_to_download = self.books_matching_subject(subject).head(n)['Text#']
        return [self.download_book(b) for b in books_to_download]

    def download_size_books(self, size_mb=128, subject=None):
        """
        Download books from the Gutenberg collection based on the desired total size and subject.
        If a book already exists in the cache folder, it is not downloaded again.
        Parameters:
        - size_mb (int): The desired total size of downloaded books in MB. Default is 128MB.
        - subject (str, optional): The subject to match when selecting books. Default is None.
        Returns:
        - list: A list of paths where the downloaded books are saved.
        """
        if subject:
            matching_books = self.books_matching_subject(subject)['Text#']
        else:
            matching_books = self.catalog['Text#']
        size_limit = size_mb * 1024 * 1024  # MB -> bytes
        total_size = 0
        book_paths = []
        for b in matching_books:
            if total_size >= size_limit:
                break
            book_path = self.download_book(b)
            if book_path is None:
                # Download failed (already reported); try the next book.
                continue
            total_size += os.path.getsize(book_path)
            book_paths.append(book_path)
        print(f"Total size: {int(total_size/1024/1024)}MB")
        if total_size < size_limit:
            print(f"Download more books to get {size_mb}MB")
        return book_paths
# Shared explorer instance used by all cells below; constructing it loads the catalog.
gb = GutenbergBooks()
# ## Use `cache_catalog()` to create a cached copy of the catalog
# In[2]:
# gb.cache_catalog()
# ## Interactive tables
#
# Library `data_table` from Google Colab adds interactivity to Pandas tables.
#
# https://colab.research.google.com/notebooks/data_table.ipynb
# In[3]:
# true if running on Google Colab
import sys

# Interactive DataFrame rendering: Colab ships its own `data_table`
# formatter, any other environment falls back to the `itables` package.
IN_COLAB = 'google.colab' in sys.modules
if not IN_COLAB:
    get_ipython().system('pip install itables')
    from itables import init_notebook_mode
    init_notebook_mode(all_interactive=True)
else:
    from google.colab import data_table
    from vega_datasets import data
    data_table.enable_dataframe_formatter()
# ## Code for visualizations
#
# This is needed for plotting.
# In[4]:
import matplotlib
# Palette shared by the bubble charts: the `tab20` colormap evaluated at indices 0..19.
colors = matplotlib.cm.tab20(range(20))
# source: https://matplotlib.org/stable/gallery/misc/packed_bubbles.html
import matplotlib.pyplot as plt
import numpy as np
class BubbleChart:
    """Packed-bubble chart: circles are placed on a grid and then moved
    step by step towards their common centre of mass without overlapping.

    ``self.bubbles`` holds one row per bubble: x, y, radius, area.
    """

    def __init__(self, area, bubble_spacing=0):
        """
        Setup for bubble collapse.
        Parameters
        ----------
        area : array-like
            Area of the bubbles.
        bubble_spacing : float, default: 0
            Minimal spacing between bubbles after collapsing.
        Raises
        ------
        ValueError
            If ``area`` is empty.
        Notes
        -----
        If "area" is sorted, the results might look weird.
        """
        area = np.asarray(area)
        if area.size == 0:
            raise ValueError("area must contain at least one value")
        r = np.sqrt(area / np.pi)
        self.bubble_spacing = bubble_spacing
        self.bubbles = np.ones((len(area), 4))
        self.bubbles[:, 2] = r
        self.bubbles[:, 3] = area
        # Grid pitch: large enough for two bubbles of maximal radius.
        self.maxstep = 2 * self.bubbles[:, 2].max() + self.bubble_spacing
        self.step_dist = self.maxstep / 2
        # calculate initial grid layout for bubbles
        length = np.ceil(np.sqrt(len(self.bubbles)))
        grid = np.arange(length) * self.maxstep
        gx, gy = np.meshgrid(grid, grid)
        self.bubbles[:, 0] = gx.flatten()[:len(self.bubbles)]
        self.bubbles[:, 1] = gy.flatten()[:len(self.bubbles)]
        self.com = self.center_of_mass()

    def center_of_mass(self):
        """Return the area-weighted mean of the bubble centres."""
        return np.average(
            self.bubbles[:, :2], axis=0, weights=self.bubbles[:, 3]
        )

    def center_distance(self, bubble, bubbles):
        """Return the distances from ``bubble``'s centre to each centre in ``bubbles``."""
        return np.hypot(bubble[0] - bubbles[:, 0],
                        bubble[1] - bubbles[:, 1])

    def outline_distance(self, bubble, bubbles):
        """Return the gaps between outlines, minus the required spacing."""
        center_distance = self.center_distance(bubble, bubbles)
        return center_distance - bubble[2] - \
            bubbles[:, 2] - self.bubble_spacing

    def check_collisions(self, bubble, bubbles):
        """Return how many of ``bubbles`` are closer to ``bubble`` than allowed."""
        distance = self.outline_distance(bubble, bubbles)
        return len(distance[distance < 0])

    def collides_with(self, bubble, bubbles):
        """Return (as a 1-element array) the index of the nearest bubble."""
        distance = self.outline_distance(bubble, bubbles)
        return np.argmin(distance, keepdims=True)

    def collapse(self, n_iterations=50):
        """
        Move bubbles to the center of mass.
        Parameters
        ----------
        n_iterations : int, default: 50
            Number of moves to perform.
        """
        for _i in range(n_iterations):
            moves = 0
            for i in range(len(self.bubbles)):
                rest_bub = np.delete(self.bubbles, i, 0)
                # try to move directly towards the center of mass
                # direction vector from bubble to the center of mass
                dir_vec = self.com - self.bubbles[i, :2]
                norm = np.sqrt(dir_vec.dot(dir_vec))
                if norm == 0:
                    # The bubble already sits on the centre of mass (always
                    # true for a single bubble); normalising would give 0/0
                    # and turn every position into NaN.
                    continue
                # shorten direction vector to have length of 1
                dir_vec = dir_vec / norm
                # calculate new bubble position
                new_point = self.bubbles[i, :2] + dir_vec * self.step_dist
                new_bubble = np.append(new_point, self.bubbles[i, 2:4])
                # check whether new bubble collides with other bubbles
                if not self.check_collisions(new_bubble, rest_bub):
                    self.bubbles[i, :] = new_bubble
                    self.com = self.center_of_mass()
                    moves += 1
                else:
                    # try to move around a bubble that you collide with
                    # find colliding bubble
                    for colliding in self.collides_with(new_bubble, rest_bub):
                        # calculate direction vector
                        dir_vec = rest_bub[colliding, :2] - self.bubbles[i, :2]
                        norm = np.sqrt(dir_vec.dot(dir_vec))
                        if norm == 0:
                            # Coincident centres: no direction to go around.
                            continue
                        dir_vec = dir_vec / norm
                        # calculate orthogonal vector
                        orth = np.array([dir_vec[1], -dir_vec[0]])
                        # test which direction to go
                        new_point1 = (self.bubbles[i, :2] + orth *
                                      self.step_dist)
                        new_point2 = (self.bubbles[i, :2] - orth *
                                      self.step_dist)
                        dist1 = self.center_distance(
                            self.com, np.array([new_point1]))
                        dist2 = self.center_distance(
                            self.com, np.array([new_point2]))
                        new_point = new_point1 if dist1 < dist2 else new_point2
                        new_bubble = np.append(new_point, self.bubbles[i, 2:4])
                        if not self.check_collisions(new_bubble, rest_bub):
                            self.bubbles[i, :] = new_bubble
                            self.com = self.center_of_mass()
            # Fewer than 10% of the bubbles moved directly: refine the step.
            if moves / len(self.bubbles) < 0.1:
                self.step_dist = self.step_dist / 2

    def plot(self, ax, labels, colors):
        """
        Draw the bubble plot.
        Parameters
        ----------
        ax : matplotlib.axes.Axes
        labels : list
            Labels of the bubbles.
        colors : list
            Colors of the bubbles.
        """
        for i, bubble in enumerate(self.bubbles):
            circ = plt.Circle(bubble[:2], bubble[2], color=colors[i])
            ax.add_patch(circ)
            ax.text(*bubble[:2], labels[i],
                    horizontalalignment='center', verticalalignment='center')
# Attempt to set the font family
from matplotlib import font_manager

desired_font_family = 'DejaVu Serif'
try:
    # Assigning rcParams['font.family'] does not check that the font is
    # installed, so ask the font manager explicitly; with
    # fallback_to_default=False it raises ValueError for a missing font.
    font_manager.findfont(desired_font_family, fallback_to_default=False)
except ValueError:
    print(f"Warning: Font family '{desired_font_family}' not found. Using fallback font.")
    plt.rcParams['font.family'] = 'serif'  # Fallback to a generic serif font
else:
    plt.rcParams['font.family'] = desired_font_family
    print(f"Using '{desired_font_family}' font family.")
# # Explore
# ## Books
# ### All books
#
# The whole Gutenberg collection catalog is saved in the `catalog` of the `GutenbergBooks` object `gb`.
# In[5]:
# Notebook cells: in Jupyter a bare expression at the end of a cell is
# rendered as the cell output; run as a plain script it is evaluated and
# its value discarded.
gb.catalog
# ### Count books in the collection
#
# There are currently $73109$ books in the collection.
# In[6]:
len(gb.catalog)
# ### First five books in the catalog
# In[7]:
gb.catalog.head(5)
# ### Five random books
#
# Looking only at the first lines of a DataFrame might provide an initial glimpse into the data, but it can be insufficient for gaining a comprehensive understanding of its characteristics, that's why sampling from the DataFrame is often more beneficial.
#
# So, let's break away from the norm of quickly scanning the first few lines of a file with the `head` command. Let us instead allocate a bit more computational power and extract a small yet representative sample of the data.
# In[8]:
print("Five random books from catalog")
# The fixed seed makes the sample reproducible.
gb.random_books(n=5, seed=42)
# ## Subjects
# ### Count distinct subjects
#
# There are currently $39619$ distinct subjects.
# In[9]:
# Notebook cells: each bare expression is displayed as cell output in Jupyter.
len(gb.get_subjects())
# ### Top $n$ subjects
# In[10]:
n = 10
gb.topn_subjects(n)
# ### 20K subjects
# In[11]:
pd.DataFrame(gb.topn_subjects(20000))
# Limiting the number of rows to 20000 because this is the maximum number supported
# by Colab's `data_table`.
# ### Ten random subjects
# In[12]:
# Sample 10 subjects with seed 42, then order the sample by descending count.
gb.random_subjects(10, 42).sort_values(by='Count', ascending=False)
# ### List books matching a given subject
#
# Change the subject by setting the variable `substr` (search is case-insensitive).
# In[13]:
substr = "description and travel"
gb.books_matching_subject(substr).head()
# ### Visualize most frequent subjects
# In[14]:
n = 20
# Compute the top-n table once; it is needed for both labels and sizes.
top_subjects = gb.topn_subjects(n)
gutenberg_books_subjects = {
    # Break long "A -- B" subjects over several lines inside the bubble.
    'subjects': top_subjects['Subject'].replace({' -- ': '\n'}, regex=True).to_list(),
    # Counts scaled by n*3 to get bubble areas of a readable size.
    'market_share': [count * n * 3 for count in top_subjects['Count'].to_list()],
    'color': colors[:n]
}
bubble_chart = BubbleChart(area=gutenberg_books_subjects['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_subjects['subjects'], gutenberg_books_subjects['color'])
ax.axis("off")
# Patches do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} subjects')
plt.show()
# ## Authors
# ### Count distinct authors
#
# There are currently $37392$ distinct authors.
# In[15]:
# Notebook cells: each bare expression is displayed as cell output in Jupyter.
len(gb.get_authors())
# ### All authors
# The `data_table` library can only deal with a maximum of $20000$ rows. If the number of rows exceeds this limit, the usual Pandas display is used (with no interactivity).
#
# In[16]:
pd.DataFrame(gb.get_authors())
# ### Top $n$ authors
# In[17]:
n = 20000
gb.topn_authors(n)
# ### Ten random authors
# In[18]:
# Sample 10 authors with seed 42, then order the sample by descending count.
gb.random_authors(10, 42).sort_values(by='Count', ascending=False)
# ### Visualize most frequent authors
# In[19]:
n = 20
# Compute the top-n table once; it is needed for both labels and sizes.
top_authors = gb.topn_authors(n)
gutenberg_books_authors = {
    # Put each part of "Surname, Name, years [Role]" on its own line.
    # The pattern with the bracket is a raw string: '\[' is not a valid
    # escape sequence in an ordinary string literal.
    'authors': top_authors['Author'].replace({', ': '\n', r' \[': '\n['}, regex=True).to_list(),
    # Counts scaled by n*3 to get bubble areas of a readable size.
    'market_share': [count * n * 3 for count in top_authors['Count'].to_list()],
    'color': colors[:n]
}
bubble_chart = BubbleChart(area=gutenberg_books_authors['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_authors['authors'], gutenberg_books_authors['color'])
ax.axis("off")
# Patches do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} authors')
plt.show()
# ## Types
# ### All types
# In[20]:
# Notebook cells: each bare expression is displayed as cell output in Jupyter.
pd.DataFrame(gb.get_types(), columns=['Type'])
# ### Count books by types
# In[21]:
# Number of catalog rows per value of 'Type'.
grouped_counts = gb.catalog.groupby('Type').size().reset_index(name='Count')
grouped_counts
# ### Visualize types
# In[22]:
grouped_data = gb.catalog.groupby('Type').size().reset_index(name='Count')
n = len(grouped_data)
# Extracting values of 'Type' and 'Count' columns as lists
type_list = grouped_data['Type'].tolist()
count_list = grouped_data['Count'].tolist()
gutenberg_books_types = {
    'types': type_list,
    # adapt the size of smaller items
    'market_share': [count if count > 1000 else count * n * 10 for count in count_list],
    # One colour per type: the first n entries of the palette
    # (`colors[:-n]` dropped the last n entries instead and only worked
    # while the palette held at least twice as many colours as types).
    'color': colors[:n]
}
bubble_chart = BubbleChart(area=gutenberg_books_types['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_types['types'], gutenberg_books_types['color'])
ax.axis("off")
# Patches do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title('Gutenberg books top types')
subtitle = "(the depicted proportions have been altered and do not reflect the true distribution)"
# Set the subtitle below the main title
ax.text(0.5, 0.98, subtitle, fontsize=10, ha='center', transform=ax.transAxes)
plt.show()
# I wasn't aware that the Gutenberg collection contained data other than text. I'll need to explore these additional data types at some point.
# ## Bookshelves
# ### Top $n$ bookshelves
# In[23]:
n = 10
# Notebook cell: the table is displayed as cell output in Jupyter.
gb.topn_bookshelves(n)
# ### Visualize most frequent bookshelves
# In[24]:
n = 20
# Compute the top-n table once; it is needed for both labels and sizes.
top_bookshelves = gb.topn_bookshelves(n)
gutenberg_books_bookshelves = {
    # Break comma-separated bookshelf names over several lines.
    'bookshelves': top_bookshelves['Bookshelf'].replace({', ': '\n'}, regex=True).to_list(),
    # Counts scaled by n*3 to get bubble areas of a readable size.
    'market_share': [count * n * 3 for count in top_bookshelves['Count'].to_list()],
    'color': colors[:n]
}
bubble_chart = BubbleChart(area=gutenberg_books_bookshelves['market_share'],
                           bubble_spacing=4*n)
bubble_chart.collapse()
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_bookshelves['bookshelves'], gutenberg_books_bookshelves['color'])
ax.axis("off")
# Patches do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} bookshelves')
plt.show()
# ### Books without bookshelf
#
# Many books do not belong to any bookshelf
# In[25]:
# Number of non-missing values per column; a count below the number of
# rows means that some books have no value in that column.
gb.catalog.count()
# ### Number of books without bookshelf
# In[26]:
# Rows whose 'Bookshelves' value is missing.
print(f"Number of books with no bookshelf: {gb.catalog[gb.catalog['Bookshelves'].isna()].shape[0]}")
# ### Five random books without bookshelf
# In[27]:
# Reproducible sample (seed 42) of the rows with a missing 'Bookshelves' value.
gb.catalog[gb.catalog['Bookshelves'].isna()].sample(n=5, replace=False, random_state=42)
# ## Languages
# ### Count distinct languages
#
# The Gutenberg collection currently comprises 68 languages.
# In[28]:
# Notebook cells: each bare expression is displayed as cell output in Jupyter.
len(gb.get_languages())
# ### Top $n$ languages
# In[29]:
gb.topn_languages(10)
# ### Visualize top $n$ languages
# In[30]:
n = 20
# Compute the top-n table once; it is needed for both labels and sizes.
top_languages = gb.topn_languages(n)
gutenberg_books_languages = {
    'languages': top_languages['Language'].to_list(),
    # Counts scaled by 10 to get bubble areas of a readable size.
    'market_share': [count * 10 for count in top_languages['Count'].to_list()],
    'color': colors
}
bubble_chart = BubbleChart(area=gutenberg_books_languages['market_share'],
                           bubble_spacing=15)
bubble_chart.collapse()
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_languages['languages'], gutenberg_books_languages['color'])
ax.axis("off")
# Patches do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} languages')
plt.show()
# ## Match books using various criteria
# ### Match books by subject
# In[31]:
# Notebook cells: each bare expression is displayed as cell output in Jupyter.
substr = "description and travel"
gb.books_matching_subject(substr)
# ### Match books by year
# In[32]:
# Books having an author whose birth-death interval contains 1984.
gb.books_matching_year(1984)
# In[33]:
# Print the method's docstring.
help(GutenbergBooks.books_matching_year)
# # Downloading files from the Gutenberg collection
#
# ⚠️ Please read carefully this notice about the Gutenberg Project's policies on bulk downloading:
#
#
#
# > “ _The Project Gutenberg website is intended for human users only. Any perceived use of automated tools to access the Project Gutenberg website will result in a temporary or permanent block of your IP address._ ”
#
# See: https://www.gutenberg.org/policy/robot_access.html.
#
# ## The cache directory
#
# By default, `GutenbergBooks` is the directory where all downloaded books are stored. If a book is already in the `GutenbergBooks` directory it won't be downloaded again.
#
# ⚠️ The cache directory is empty when you start your Google Colab session! ⚠️
# ## Download one book
# In[34]:
# Downloads book 5687 into the cache directory unless it is already there;
# the returned path is displayed as cell output in Jupyter.
gb.download_book(5687)
# The same book won't be downloaded because it already exists in the cache directory `GutenbergBooks`.
# In[35]:
gb.download_book(5687)
# In[36]:
# Print the method's docstring.
help(GutenbergBooks.download_book)
# ## Download multiple books
# In[37]:
# One download_book call per id; returns the list of results.
gb.download_books([5678, 5679, 5680])
# In[38]:
help(GutenbergBooks.download_books)
# ## Download $n$ books by subject
# In[39]:
# The subject is matched as a regular expression, hence the escaped
# parentheses; a raw string keeps the backslashes without relying on
# invalid escape sequences.
gb.download_n_books(5, r"\(South Africa\) -- Description and travel")
# ## Download a given amount of books by subject
#
# Download books matching a certain subject. Stop when the threshold given by the `size_mb` (size in Megabytes) parameter is reached.
#
# If not specified, `size_mb` is $128$ (the default Hadoop block size).
# In[40]:
gb.download_size_books(subject=r"\(South Africa\) -- Description and travel")
# In[41]:
# Show the total size of the cache directory.
get_ipython().system('du -sh GutenbergBooks')
# In[42]:
subject = "United States -- Description and travel"
gb.download_size_books(size_mb=90, subject=subject)
# In[43]:
get_ipython().system('du -sh GutenbergBooks')
# It's not easy to get enough data!
# In[44]:
subject = "California -- Description and travel"
gb.download_size_books(size_mb=50, subject=subject)
# In[45]:
get_ipython().system('du -sh GutenbergBooks')
# # Acknowledgements and some thoughts on Artificial Intelligence
#
# For this tutorial I've made extensive use of the ChatGPT (version $3.5$) AI to:
# - improve my English
# - define code structure
# - write Python code snippets
# - document code
#
# I ideated, organized, adapted, double-checked all content (both text and code) with the aim of creating a useful tool for exploring the Gutenberg books collection and providing a pleasant user experience.
#
# I can imagine that in the future AI will be able to write such tutorials on their own and then the role of a tutorial author will be limited to defining requirements. Maybe there are going to be self-adapting tutorials that create themselves on the spot according to the needs of the readers, thus bypassing the need for tutorial authors. What are tutorial authors going to do then? Something else 😀!
#
#
# In this spirit, I'd like to thank everyone who contributed to the common sense language collection (both natural and programming languages) used to train ChatGPT, the creators of ChatGPT, and the companies making it available as a comfortable Web application.