#!/usr/bin/env python
# coding: utf-8

# Open In Colab
#
# Logo Big Data for Beginners
# # Explore and download books from the Gutenberg Books collection
#
# This Jupyter Notebook provides an interactive exploration and downloading interface for the Gutenberg Books Collection.
#
# Explore the vast collection of books, analyze metadata, and download selected texts based on various criteria. Dive into literary exploration and access timeless classics with ease
#
# **Note:** you can execute the whole "Preliminaries" section while it is collapsed by clicking on the "run" icon. Once all the cells in the "Preliminaries" section have been executed, all other cells can be executed independently of one another.
#
# ![preliminaries_collapse.png]()

# # Preliminaries

# ## Main class `GutenbergBooks`

# In[1]:


import gzip
import urllib.request
import requests
import os
import io
import pandas as pd

GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz"
GUTENBERG_CACHEDIR = "GutenbergBooks"

pd.options.mode.copy_on_write = True


class GutenbergBooks:
    """
    Access to the Project Gutenberg catalog and a local cache of downloaded books.

    The catalog is downloaded from `GUTENBERG_URL` (or read from a local CSV
    copy when one exists) into the pandas DataFrame `self.catalog`. Books are
    downloaded as plain-text files into the directory `self.cachedir`.
    """

    def __init__(self):
        self.catalog_url = GUTENBERG_URL
        # Strip the trailing ".gz": the local cache holds the decompressed CSV.
        self.catalog_file = self.catalog_url.rsplit('/', 1)[-1][:-3]
        self.cachedir = GUTENBERG_CACHEDIR
        os.makedirs(self.cachedir, exist_ok=True)
        self.catalog = self.fetch_catalog()
        # fetch_catalog() returns None when the download fails; keep the object usable.
        self.all_subjects = self.get_subjects() if self.catalog is not None else []

    @property
    def is_cached(self):
        """bool: True if a local copy of the catalog CSV exists on disk."""
        # A property (not a stored flag) so that the value can never go stale
        # and never shadows a method of the same name.
        return os.path.isfile(self.catalog_file)

    def cache_catalog(self):
        """
        Refresh the local copy of the catalog by downloading it again.

        If the download fails, the catalog already held in memory is kept.
        """
        fresh_catalog = self.fetch_catalog(use_cache=False)
        if fresh_catalog is None:
            return
        # fetch_catalog() has already written the CSV file to disk.
        self.catalog = fresh_catalog
        self.all_subjects = self.get_subjects()

    def is_book_downloaded(self, bookID):
        """
        Tell whether the text of a book is already in the cache directory.

        Parameters:
        - bookID (int or str): id of the book in the Gutenberg books collection.

        Returns:
        - bool: True if the file `pg<bookID>.txt` exists in the cache directory.
        """
        book_file = f"pg{bookID}.txt"
        return os.path.isfile(os.path.join(self.cachedir, book_file))

    def fetch_catalog(self, use_cache=True):
        """
        Load the Gutenberg catalog, from the local cache or from the Web.

        Parameters:
        - use_cache (bool): if True and a cached CSV exists, read it instead of downloading.

        Returns:
        - DataFrame: the catalog, or None if the download failed.
        """
        url = self.catalog_url
        filename = self.catalog_file
        if self.is_cached and use_cache:
            print(f"Retrieving {filename} from cache. To refresh cache use cache_catalog()")
            dataframe = pd.read_csv(filename, quotechar='"')
            # Caches written with the index included carry a spurious first column.
            return dataframe.drop(columns=["Unnamed: 0"], errors="ignore")
        try:
            # Retrieve the compressed file from the URL
            print(f"Retrieving {filename} from {url}.")
            with urllib.request.urlopen(url) as response:
                compressed_data = response.read()
            # Decompress the data
            decompressed_data = gzip.decompress(compressed_data)
            # Load decompressed data into pandas DataFrame
            dataframe = pd.read_csv(io.StringIO(decompressed_data.decode('utf-8')), quotechar='"')
            # index=False: otherwise reading the cache back adds an "Unnamed: 0" column
            dataframe.to_csv(filename, index=False)
            return dataframe
        except Exception as e:
            # Best effort: report the problem and let the caller deal with None.
            print("An error occurred:", e)
            return None

    def _count_values(self, column, label=None):
        """
        Count the occurrences of each '; '-separated value of a catalog column.

        Parameters:
        - column (str): name of the catalog column.
        - label (str, optional): new name for the value column in the result.

        Returns:
        - DataFrame: one row per distinct value with its 'Count', sorted by
          decreasing count and re-indexed from 0.
        """
        values = self.catalog[column].str.split('; ').explode()
        counts = values.groupby(values).count() \
            .reset_index(name='Count') \
            .sort_values(by='Count', ascending=False)
        if label is not None:
            counts = counts.rename(columns={column: label})
        return counts.reset_index(drop=True)

    def get_subjects(self):
        """Return the list of distinct subjects found in the catalog."""
        return self.catalog['Subjects'].str.split('; ').explode().unique().tolist()

    def random_subjects(self, n, seed):
        """
        This method returns n random subjects.

        Parameters:
        - n (int): number of subjects.
        - seed (int): random seed for reproducibility.

        Returns:
        - DataFrame: Random sample of subjects (with their counts) from the Gutenberg
          Books catalog following the subjects distribution.
        """
        subject_counts = self._count_values('Subjects', 'Subject')
        return subject_counts.sample(n=n, replace=False, random_state=seed,
                                     weights=subject_counts['Count'])

    def topn_subjects(self, n):
        """Return a DataFrame with the n most frequent subjects and their counts."""
        return self._count_values('Subjects', 'Subject').head(n)

    def get_authors(self):
        """Return the list of distinct authors found in the catalog."""
        return self.catalog['Authors'].str.split('; ').explode().unique().tolist()

    def random_authors(self, n, seed):
        """
        Return n random authors, sampled proportionally to their number of books.

        Parameters:
        - n (int): number of authors.
        - seed (int): random seed for reproducibility.

        Returns:
        - DataFrame: sampled authors with their counts.
        """
        author_counts = self._count_values('Authors', 'Author')
        return author_counts.sample(n=n, replace=False, random_state=seed,
                                    weights=author_counts['Count'])

    def topn_authors(self, n):
        """Return a DataFrame with the n most frequent authors and their counts."""
        return self._count_values('Authors', 'Author').head(n)

    def get_languages(self):
        """Return the list of distinct languages found in the catalog."""
        return self.catalog['Language'].str.split('; ').explode().unique().tolist()

    def topn_languages(self, n):
        """Return a DataFrame with the n most frequent languages and their counts."""
        return self._count_values('Language').head(n)

    def get_bookshelves(self):
        """Return the list of distinct bookshelves found in the catalog."""
        return self.catalog['Bookshelves'].str.split('; ').explode().unique().tolist()

    def topn_bookshelves(self, n):
        """Return a DataFrame with the n most frequent bookshelves and their counts."""
        return self._count_values('Bookshelves', 'Bookshelf').head(n)

    def get_types(self):
        """Return the list of distinct values of the catalog's 'Type' column."""
        return self.catalog['Type'].unique().tolist()

    def get_books(self, lang=None, subject=None, title=None):
        """
        Select books by language, subject and title.

        Every criterion is optional; criteria that are None are ignored.

        Parameters:
        - lang (str, optional): language code that must appear in the 'Language'
          column (exact match on one of the '; '-separated codes, case-insensitive).
        - subject (str, optional): regular expression searched in 'Subjects' (case-insensitive).
        - title (str, optional): regular expression searched in 'Title' (case-insensitive).

        Returns:
        - DataFrame: the catalog rows satisfying all the given criteria.
        """
        books = self.catalog
        if lang is not None:
            wanted = lang.lower()
            codes = books['Language'].str.lower().str.split('; ')
            # Rows with a missing language yield NaN (not a list) and never match.
            has_lang = codes.apply(lambda c: isinstance(c, list) and wanted in c)
            books = books[has_lang.astype(bool)]
        if subject is not None:
            books = books[books['Subjects'].str.contains(subject, case=False, na=False)]
        if title is not None:
            books = books[books['Title'].str.contains(title, case=False, na=False)]
        return books

    def random_books(self, n, seed):
        """Return n random rows of the catalog (seed makes the sample reproducible)."""
        return self.catalog.sample(n=n, replace=False, random_state=seed)

    def books_matching_subject(self, substr):
        """
        Return the books whose 'Subjects' contain substr.

        The search is case-insensitive and substr is interpreted as a regular
        expression (escape characters such as parentheses to match them literally).
        """
        mask = self.catalog['Subjects'].str.contains(substr, case=False, na=False)
        return self.catalog[mask]

    def books_matching_author(self, substr):
        """
        Return the books whose 'Authors' contain substr.

        The search is case-insensitive and substr is interpreted as a regular expression.
        """
        mask = self.catalog['Authors'].str.contains(substr, case=False, na=False)
        return self.catalog[mask]

    def books_matching_year(self, given_year):
        """
        Find books from the catalog that match a given year within the birth-death intervals of authors.

        Parameters:
        - given_year (int): The year to match within the birth-death intervals of authors.

        Returns:
        - DataFrame: A DataFrame containing books from the catalog where the given year falls
          within the birth-death intervals of authors.

        This method extracts birth and death years from the 'Authors' column of the catalog and
        filters rows where the given year is within any birth-death interval. A book appears once
        for every author interval that matches. The result carries the extra columns 'match'
        (position of the interval within 'Authors'), 'Birth_Year' and 'Death_Year'.
        """
        # One row per "<birth>-<death>" interval found in 'Authors'
        intervals = self.catalog['Authors'].str.extractall(r'((?:\w+\s+)?(?:\d{4})\s*-\s*(?:\d{4}))')
        intervals = intervals.reset_index().rename(columns={0: 'Author_Interval'})

        # Attach the catalog data to every interval ('level_0' is the catalog row label)
        merged_df = pd.merge(self.catalog, intervals, left_index=True, right_on='level_0')

        # Extract birth and death years from the interval itself, ignoring any leading word
        years = merged_df['Author_Interval'].str.extract(r'(\d{4})\s*-\s*(\d{4})')
        merged_df['Birth_Year'] = pd.to_numeric(years[0], errors='coerce')
        merged_df['Death_Year'] = pd.to_numeric(years[1], errors='coerce')

        # Filter rows where the given year is within the birth-death interval
        in_interval = (merged_df['Birth_Year'] <= given_year) & (merged_df['Death_Year'] >= given_year)

        # Drop helper columns (not in place: the filtered frame is a selection of merged_df)
        return merged_df[in_interval].drop(columns=['Author_Interval', 'level_0'])

    def download_book(self, nr):
        """
        Download one book from the Gutenberg collection identified by its id.
        If the book already exists in the cache folder, it is not downloaded again.

        Parameters:
        - nr (int): id of the book in the Gutenberg books collection.

        Returns:
        - str: the path where the book was downloaded, or None if the download failed.
        """
        b = str(nr)
        book = f"pg{b}.txt"
        url = f"https://www.gutenberg.org/cache/epub/{b}/{book}"
        book_path = os.path.join(self.cachedir, book)
        if self.is_book_downloaded(b):
            print(f"Book {nr} already exists in cache. Not downloading.")
            return book_path
        try:
            # Retrieve the book from the URL
            print(f"Retrieving {book} from {url}.")
            response = requests.get(url, timeout=60)
            # Do not cache an error page as if it were the book
            response.raise_for_status()
            # Open the file only once the text is available, so that a failed
            # request cannot leave an empty file behind in the cache.
            with open(book_path, "w", encoding="utf-8") as f:
                f.write(response.text)
        except (requests.RequestException, OSError) as e:
            print("An error occurred:", e)
            return None
        return book_path

    def download_books(self, books):
        """
        Download a list of books from the Gutenberg collection.
        If a book already exists in the cache folder, it is not downloaded again.

        Parameters:
        - books (list): list of ids of books in the Gutenberg books collection.

        Returns:
        - list: the paths where the books were downloaded (None for failed downloads).
        """
        return [self.download_book(b) for b in books]

    def download_n_books(self, n, subject):
        """
        Download a certain number of books from the Gutenberg collection based on the desired size and subject.
        If a book already exists in the cache folder, it is not downloaded again.

        Parameters:
        - n (int): The number of books to download.
        - subject (str): The subject to match when selecting books.

        Returns:
        - list: A list of paths where the downloaded books are saved.
        """
        # Get books matching the subject, limited to the first n
        books_to_download = self.books_matching_subject(subject).head(n)['Text#']

        # Download books
        return [self.download_book(b) for b in books_to_download]

    def download_size_books(self, size_mb=128, subject=None):
        """
        Download books from the Gutenberg collection based on the desired total size and subject.
        If a book already exists in the cache folder, it is not downloaded again.

        Parameters:
        - size_mb (int): The desired total size of downloaded books in MB. Default is 128MB.
        - subject (str, optional): The subject to match when selecting books. Default is None.

        Returns:
        - list: A list of paths where the downloaded books are saved.
        """
        # Get books matching the subject if provided
        if subject:
            matching_books = self.books_matching_subject(subject)['Text#']
        else:
            matching_books = self.catalog['Text#']

        size_limit = size_mb * 1024 * 1024  # Convert MB to bytes
        total_size = 0
        book_paths = []

        # Download matching books until the total size threshold is met
        for b in matching_books:
            if total_size >= size_limit:
                break
            book_path = self.download_book(b)
            if book_path is None:
                # Download failed: skip this book instead of crashing on getsize(None)
                continue
            total_size += os.path.getsize(book_path)
            book_paths.append(book_path)

        print(f"Total size: {int(total_size/1024/1024)}MB")
        if total_size < size_limit:
            print(f"Download more books to get {size_mb}MB")

        return book_paths


gb = GutenbergBooks()


# ## Use `cache_catalog()` to create a cached copy of the catalog

# In[2]:


# gb.cache_catalog()


# ## Interactive tables
#
# Library `data_table` from Google Colab adds interactivity to Pandas tables.
#
# https://colab.research.google.com/notebooks/data_table.ipynb

# In[3]:


# true if running on Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import data_table
    from vega_datasets import data
    data_table.enable_dataframe_formatter()
else:
    get_ipython().system('pip install itables')
    from itables import init_notebook_mode
    init_notebook_mode(all_interactive=True)


# ## Code for visualizations
#
# This is needed for plotting.

# In[4]:


import matplotlib
import matplotlib.pyplot as plt
import numpy as np

colors = matplotlib.cm.tab20(range(20))


# source: https://matplotlib.org/stable/gallery/misc/packed_bubbles.html
class BubbleChart:
    def __init__(self, area, bubble_spacing=0):
        """
        Setup for bubble collapse.

        Parameters
        ----------
        area : array-like
            Area of the bubbles.
        bubble_spacing : float, default: 0
            Minimal spacing between bubbles after collapsing.

        Notes
        -----
        If "area" is sorted, the results might look weird.
        """
        area = np.asarray(area)
        r = np.sqrt(area / np.pi)

        self.bubble_spacing = bubble_spacing
        # One row per bubble: x, y, radius, area
        self.bubbles = np.ones((len(area), 4))
        self.bubbles[:, 2] = r
        self.bubbles[:, 3] = area
        self.maxstep = 2 * self.bubbles[:, 2].max() + self.bubble_spacing
        self.step_dist = self.maxstep / 2

        # calculate initial grid layout for bubbles
        length = np.ceil(np.sqrt(len(self.bubbles)))
        grid = np.arange(length) * self.maxstep
        gx, gy = np.meshgrid(grid, grid)
        self.bubbles[:, 0] = gx.flatten()[:len(self.bubbles)]
        self.bubbles[:, 1] = gy.flatten()[:len(self.bubbles)]

        self.com = self.center_of_mass()

    def center_of_mass(self):
        """Return the area-weighted mean position of all bubbles."""
        return np.average(
            self.bubbles[:, :2], axis=0, weights=self.bubbles[:, 3]
        )

    def center_distance(self, bubble, bubbles):
        """Return the distances between the center of bubble and the centers of bubbles."""
        return np.hypot(bubble[0] - bubbles[:, 0],
                        bubble[1] - bubbles[:, 1])

    def outline_distance(self, bubble, bubbles):
        """Return the gaps between the outlines, bubble_spacing included (negative: overlap)."""
        center_distance = self.center_distance(bubble, bubbles)
        return center_distance - bubble[2] - \
            bubbles[:, 2] - self.bubble_spacing

    def check_collisions(self, bubble, bubbles):
        """Return the number of bubbles that bubble collides with."""
        distance = self.outline_distance(bubble, bubbles)
        return len(distance[distance < 0])

    def collides_with(self, bubble, bubbles):
        """Return (as a one-element array) the index of the bubble closest to bubble."""
        distance = self.outline_distance(bubble, bubbles)
        return np.argmin(distance, keepdims=True)

    def collapse(self, n_iterations=50):
        """
        Move bubbles to the center of mass.

        Parameters
        ----------
        n_iterations : int, default: 50
            Number of moves to perform.
        """
        for _i in range(n_iterations):
            moves = 0
            for i in range(len(self.bubbles)):
                rest_bub = np.delete(self.bubbles, i, 0)
                # try to move directly towards the center of mass
                # direction vector from bubble to the center of mass
                dir_vec = self.com - self.bubbles[i, :2]

                norm = np.sqrt(dir_vec.dot(dir_vec))
                if norm == 0:
                    # The bubble already sits on the center of mass (always the
                    # case for a single bubble); normalizing would produce NaNs.
                    continue

                # shorten direction vector to have length of 1
                dir_vec = dir_vec / norm

                # calculate new bubble position
                new_point = self.bubbles[i, :2] + dir_vec * self.step_dist
                new_bubble = np.append(new_point, self.bubbles[i, 2:4])

                # check whether new bubble collides with other bubbles
                if not self.check_collisions(new_bubble, rest_bub):
                    self.bubbles[i, :] = new_bubble
                    self.com = self.center_of_mass()
                    moves += 1
                else:
                    # try to move around a bubble that you collide with
                    # find colliding bubble
                    for colliding in self.collides_with(new_bubble, rest_bub):
                        # calculate direction vector
                        dir_vec = rest_bub[colliding, :2] - self.bubbles[i, :2]
                        dir_vec = dir_vec / np.sqrt(dir_vec.dot(dir_vec))
                        # calculate orthogonal vector
                        orth = np.array([dir_vec[1], -dir_vec[0]])
                        # test which direction to go
                        new_point1 = (self.bubbles[i, :2] + orth *
                                      self.step_dist)
                        new_point2 = (self.bubbles[i, :2] - orth *
                                      self.step_dist)
                        dist1 = self.center_distance(
                            self.com, np.array([new_point1]))
                        dist2 = self.center_distance(
                            self.com, np.array([new_point2]))
                        new_point = new_point1 if dist1 < dist2 else new_point2
                        new_bubble = np.append(new_point, self.bubbles[i, 2:4])
                        if not self.check_collisions(new_bubble, rest_bub):
                            self.bubbles[i, :] = new_bubble
                            self.com = self.center_of_mass()

            # Few bubbles could move: refine the step size
            if moves / len(self.bubbles) < 0.1:
                self.step_dist = self.step_dist / 2

    def plot(self, ax, labels, colors):
        """
        Draw the bubble plot.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
        labels : list
            Labels of the bubbles.
        colors : list
            Colors of the bubbles.
        """
        for i, bubble in enumerate(self.bubbles):
            circ = plt.Circle(bubble[:2], bubble[2], color=colors[i])
            ax.add_patch(circ)
            ax.text(*bubble[:2], labels[i],
                    horizontalalignment='center', verticalalignment='center')


# Attempt to set the font family
desired_font_family = 'DejaVu Serif'
try:
    plt.rcParams['font.family'] = desired_font_family
    print(f"Using '{desired_font_family}' font family.")
except (KeyError, ValueError):
    print(f"Warning: Font family '{desired_font_family}' not found. Using fallback font.")
    plt.rcParams['font.family'] = 'serif'  # Fallback to a generic serif font


# # Explore

# ## Books

# ### All books
#
# The whole Gutenberg collection catalog is saved in the `catalog` attribute of the `GutenbergBooks` object `gb`.

# In[5]:


gb.catalog


# ### Count books in the collection
#
# There are currently $73109$ books in the collection.

# In[6]:


len(gb.catalog)


# ### First five books in the catalog

# In[7]:


gb.catalog.head(5)


# ### Five random books
#
# Looking only at the first lines of a DataFrame might provide an initial glimpse into the data, but it can be insufficient for gaining a comprehensive understanding of its characteristics, that's why sampling from the DataFrame is often more beneficial.
#
# So, let's break away from the norm of quickly scanning the first few lines of a file with the `head` command. Let us instead allocate a bit more computational power and extract a small yet representative sample of the data.

# In[8]:


print("Five random books from catalog")
gb.random_books(n=5, seed=42)


# ## Subjects

# ### Count distinct subjects
#
# There are currently $39619$ distinct subjects.

# In[9]:


len(gb.get_subjects())


# ### Top $n$ subjects

# In[10]:


n = 10
gb.topn_subjects(n)


# ### 20K subjects

# In[11]:


pd.DataFrame(gb.topn_subjects(20000))
# Limiting the number of rows to 20000 because this is the maximum number supported
# by Colab's `data_table`.
# ### Ten random subjects

# In[12]:


gb.random_subjects(10, 42).sort_values(by='Count', ascending=False)


# ### List books matching a given subject
#
# Change the subject by setting the variable `substr` (search is case-insensitive).

# In[13]:


substr = "description and travel"
gb.books_matching_subject(substr).head()


# ### Visualize most frequent subjects

# In[14]:


n = 20
# Evaluate the ranking once and reuse it for both labels and bubble sizes
top_subjects = gb.topn_subjects(n)
gutenberg_books_subjects = {
    # " -- " separates the parts of a subject heading: put each part on its own line
    'subjects': top_subjects['Subject'].replace({' -- ': '\n'}, regex=True).to_list(),
    # scale the counts so that the bubbles are large enough to hold their labels
    'market_share': [count * n * 3 for count in top_subjects['Count'].to_list()],
    'color': colors[:n]
}

bubble_chart = BubbleChart(area=gutenberg_books_subjects['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw={"aspect": "equal"}, figsize=(10, 10))
bubble_chart.plot(ax,
                  gutenberg_books_subjects['subjects'],
                  gutenberg_books_subjects['color'])
ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} subjects')

plt.show()


# ## Authors

# ### Count distinct authors
#
# There are currently $37392$ distinct authors.

# In[15]:


len(gb.get_authors())


# ### All authors
# The `data_table` library can only deal with a maximum of $20000$ rows. If the number of rows exceeds this limit, the usual Pandas display is used (with no interactivity).
#

# In[16]:


pd.DataFrame(gb.get_authors())


# ### Top $n$ authors

# In[17]:


n = 20000
gb.topn_authors(n)


# ### Ten random authors

# In[18]:


gb.random_authors(10, 42).sort_values(by='Count', ascending=False)


# ### Visualize most frequent authors

# In[19]:


n = 20
# Evaluate the ranking once and reuse it for both labels and bubble sizes
top_authors = gb.topn_authors(n)
gutenberg_books_authors = {
    # Break "Surname, Name, years [Role]" over several lines.
    # The keys are regular expressions: raw string for the escaped bracket.
    'authors': top_authors['Author'].replace({', ': '\n', r' \[': '\n['}, regex=True).to_list(),
    'market_share': [count * n * 3 for count in top_authors['Count'].to_list()],
    'color': colors[:n]
}

bubble_chart = BubbleChart(area=gutenberg_books_authors['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_authors['authors'], gutenberg_books_authors['color'])
ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} authors')

plt.show()


# ## Types

# ### All types

# In[20]:


pd.DataFrame(gb.get_types(), columns=['Type'])


# ### Count books by types

# In[21]:


grouped_counts = gb.catalog.groupby('Type').size().reset_index(name='Count')
grouped_counts


# ### Visualize types

# In[22]:


grouped_data = gb.catalog.groupby('Type').size().reset_index(name='Count')
n = len(grouped_data)

# Extracting values of 'Type' and 'Count' columns as lists
type_list = grouped_data['Type'].tolist()
count_list = grouped_data['Count'].tolist()

gutenberg_books_types = {
    'types': type_list,
    # adapt the size of smaller items
    'market_share': [x if x > 1000 else x*n*10 for x in count_list],
    # one color per type (the first n colors of the palette)
    'color': colors[:n]
}

bubble_chart = BubbleChart(area=gutenberg_books_types['market_share'],
                           bubble_spacing=2*n)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_types['types'], gutenberg_books_types['color'])
ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title('Gutenberg books top types')
subtitle = "(the depicted proportions have been altered and do not reflect the true distribution)"
# Set the subtitle below the main title
plt.text(0.5, 0.98, subtitle, fontsize=10, ha='center', transform=plt.gca().transAxes)
plt.show()


# I wasn't aware that the Gutenberg collection contained data other than text. I'll need to explore these additional data types at some point.

# ## Bookshelves

# ### Top $n$ bookshelves

# In[23]:


n = 10
gb.topn_bookshelves(n)


# ### Visualize most frequent bookshelves

# In[24]:


n = 20
# Evaluate the ranking once and reuse it for both labels and bubble sizes
top_bookshelves = gb.topn_bookshelves(n)
gutenberg_books_bookshelves = {
    'bookshelves': top_bookshelves['Bookshelf'].replace({', ': '\n'}, regex=True).to_list(),
    'market_share': [count * n * 3 for count in top_bookshelves['Count'].to_list()],
    'color': colors[:n]
}

bubble_chart = BubbleChart(area=gutenberg_books_bookshelves['market_share'],
                           bubble_spacing=4*n)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(10, 10))
bubble_chart.plot(
    ax, gutenberg_books_bookshelves['bookshelves'], gutenberg_books_bookshelves['color'])
ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} bookshelves')

plt.show()


# ### Books without bookshelf
#
# Many books do not belong to any bookshelf

# In[25]:


gb.catalog.count()


# ### Number of books without bookshelf

# In[26]:


print(f"Number of books with no bookshelf: {gb.catalog[gb.catalog['Bookshelves'].isna()].shape[0]}")


# ### Five random books without bookshelf

# In[27]:


gb.catalog[gb.catalog['Bookshelves'].isna()].sample(n=5, replace=False, random_state=42)


# ## Languages

# ### Count distinct languages
#
# The Gutenberg collection currently comprises 68 languages.
# In[28]:


len(gb.get_languages())


# ### Top $n$ languages

# In[29]:


gb.topn_languages(10)


# ### Visualize top $n$ languages

# In[30]:


n = 20
# Evaluate the ranking once and reuse it for both labels and bubble sizes
top_languages = gb.topn_languages(n)
gutenberg_books_languages = {
    'languages': top_languages['Language'].to_list(),
    'market_share': [count * 10 for count in top_languages['Count'].to_list()],
    'color': colors
}

bubble_chart = BubbleChart(area=gutenberg_books_languages['market_share'],
                           bubble_spacing=15)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw={"aspect": "equal"}, figsize=(10, 10))
bubble_chart.plot(ax,
                  gutenberg_books_languages['languages'],
                  gutenberg_books_languages['color'])
ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title(f'Gutenberg books top {n} languages')

plt.show()


# ## Match books using various criteria

# ### Match books by subject

# In[31]:


substr = "description and travel"
gb.books_matching_subject(substr)


# ### Match books by year

# In[32]:


gb.books_matching_year(1984)


# In[33]:


help(GutenbergBooks.books_matching_year)


# # Downloading files from the Gutenberg collection
#
# ⚠️ Please read carefully this notice about the Gutenberg Project's policies on bulk downloading:
#
#
#
# > “ _The Project Gutenberg website is intended for human users only. Any perceived use of automated tools to access the Project Gutenberg website will result in a temporary or permanent block of your IP address._ ”
#
# See: https://www.gutenberg.org/policy/robot_access.html.
#
# ## The cache directory
#
# By default, `GutenbergBooks` is the directory where all downloaded books are stored. If a book is already in the `GutenbergBooks` directory it won't be downloaded again.
#
# ⚠️ The cache directory is empty when you start your Google Colab session! ⚠️

# ## Download one book

# In[34]:


gb.download_book(5687)


# The same book won't be downloaded because it already exists in the cache directory `GutenbergBooks`.
# In[35]:


gb.download_book(5687)


# In[36]:


help(GutenbergBooks.download_book)


# ## Download multiple books

# In[37]:


gb.download_books([5678, 5679, 5680])


# In[38]:


help(GutenbergBooks.download_books)


# ## Download $n$ books by subject

# In[39]:


# The subject is matched as a regular expression, so the parentheses are
# escaped; a raw string keeps the backslashes without invalid-escape warnings.
gb.download_n_books(5, r"\(South Africa\) -- Description and travel")


# ## Download a given amount of books by subject
#
# Download books matching a certain subject. Stop when the threshold given by the `size_mb` (size in Megabytes) parameter is reached.
#
# If not specified, `size_mb` is $128$ (the default Hadoop block size).

# In[40]:


gb.download_size_books(subject=r"\(South Africa\) -- Description and travel")


# In[41]:


get_ipython().system('du -sh GutenbergBooks')


# In[42]:


subject = "United States -- Description and travel"
gb.download_size_books(size_mb=90, subject=subject)


# In[43]:


get_ipython().system('du -sh GutenbergBooks')


# It's not easy to get enough data!

# In[44]:


subject = "California -- Description and travel"
gb.download_size_books(size_mb=50, subject=subject)


# In[45]:


get_ipython().system('du -sh GutenbergBooks')


# # Acknowledgements and some thoughts on Artificial Intelligence
#
# For this tutorial I've made extensive use of the ChatGPT (version $3.5$) AI to:
# - improve my English
# - define code structure
# - write Python code snippets
# - document code
#
# I ideated, organized, adapted, double-checked all content (both text and code) with the aim of creating a useful tool for exploring the Gutenberg books collection and providing a pleasant user experience.
#
# I can imagine that in the future AI will be able to write such tutorials on their own and then the role of a tutorial author will be limited to defining requirements. Maybe there are going to be self-adapting tutorials that create themselves on the spot according to the needs of the readers, thus bypassing the need for tutorial authors. What are tutorial authors going to do then? Something else 😀!
# # # In this spirit, I'd like to thank everyone who contributed to the common sense language collection (both natural and programming languages) used to train ChatGPT, the creators of ChatGPT, and the companies making it available as a comfortable Web application.