We will be using the Internet Archive Python Library to get the text data of the book Die Märchen der Gebrüder Grimm.
In particular, we need the text format of that book, which can be downloaded using the download function
of the library. We need to provide two things: the identifier of the book and the format of the file we want.
The identifier of that book can be found on the book page by looking at a field called "Identifier". In our case, the identifier is die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716
The format has a value that must be obtained by looking at the metadata of that book. Let's retrieve it first:
import json
from internetarchive import get_item
# Archive.org identifier of the book (the "Identifier" field on the book page).
BOOK_IDENTIFIER = "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716"

# Fetch the item and pretty-print its metadata so that the "format" value of
# each available file can be looked up.
item = get_item(BOOK_IDENTIFIER)
metadata_json = json.dumps(item.item_metadata, indent=4)
print(metadata_json)
{ "created": 1722929206, "d1": "ia903208.us.archive.org", "d2": "ia803208.us.archive.org", "dir": "/22/items/die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716", "files": [ { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716.epub", "source": "derivative", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_abbyy.gz", "mtime": "1699549449", "size": "2965462", "md5": "0e71e73d5b3ca5df4600d2c2b2c67028", "crc32": "e9c10c67", "sha1": "9f40b6573ed89e778875bc3f3f1320e2639235db", "format": "EPUB" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716.pdf", "source": "original", "mtime": "1594875306", "size": "148426700", "md5": "0db2a144140c098ac6253c57a71d9bd8", "crc32": "cbeb0e2b", "sha1": "629597f9a7c1009057341ea32036a11fb7555513", "format": "Image Container PDF" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_abbyy.gz", "source": "derivative", "format": "Abbyy GZ", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_jp2.zip", "mtime": "1594893532", "size": "23386869", "md5": "29b8942d0a6dad44ccecd2c5121d1283", "crc32": "5e5b4d67", "sha1": "c82e7154d2045b3c59d2d5b925894c013f258585" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_djvu.txt", "source": "derivative", "format": "DjVuTXT", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_djvu.xml", "mtime": "1594894478", "size": "1611041", "md5": "7734ed03481d4ec3d77b846da8968399", "crc32": "a3fc45df", "sha1": "0a8316a226477954453ddac48cda997a597e6a2d" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_djvu.xml", "source": "derivative", "format": "Djvu XML", "original": "Die M\u00e4rchen der 
Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_abbyy.gz", "mtime": "1594894448", "size": "15321875", "md5": "6fa6871a615c5839d343ad711df388ba", "crc32": "a1f96a26", "sha1": "435f4fe43bb03e291f09b429d13476175f043b13" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_jp2.zip", "source": "derivative", "format": "Single Page Processed JP2 ZIP", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716.pdf", "mtime": "1594888035", "size": "434155240", "md5": "565cfb79efc0098c2c7faf1d70bd41bd", "crc32": "b1cd82a6", "sha1": "f68aee4cc89dad4efa4968201d841a301b6f8b43", "filecount": "630" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_page_numbers.json", "source": "derivative", "format": "Page Numbers JSON", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_djvu.xml", "mtime": "1594908526", "size": "115749", "md5": "441528fddc4d1d2a324d96b9d8bf7a83", "crc32": "a227200c", "sha1": "f0eaf0173439d1aafec792641cdb8061ce096c67" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_scandata.xml", "source": "derivative", "format": "Scandata", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_djvu.xml", "mtime": "1594908528", "size": "210916", "md5": "7f8be4f82c9dcd466e6952dbd13e6857", "crc32": "4489685d", "sha1": "c700a50b1e132e4255adf697473591876076db58" }, { "name": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_text.pdf", "source": "derivative", "format": "Additional Text PDF", "original": "Die M\u00e4rchen der Gebr\u00fcder Grimm_Kinder- und Hausm\u00e4rchen_141 MBytes_20200716_abbyy.gz", "mtime": "1594908492", "size": "57348160", "md5": "f4d53dbc0a9a83df4dac87d8b9345fb2", "crc32": "97211be1", "sha1": 
"3742f326aa747715491834c2ac0cd117bc98d43f" }, { "name": "__ia_thumb.jpg", "source": "original", "mtime": "1699549450", "size": "24968", "md5": "031971f7396aed65e00ce754414abbc4", "crc32": "deb5696d", "sha1": "d54f5dedf0438277cef97d00d908f95f08860e76", "format": "Item Tile", "rotation": "0" }, { "name": "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716_archive.torrent", "source": "metadata", "btih": "581a0c4cdaac082d938d4aebafec989685aa0f6b", "mtime": "1699549454", "size": "29312", "md5": "14541c25978cc2977f09b08ea66bf6ce", "crc32": "a9048f6b", "sha1": "5854c6147fc2ecb07274b424fde6aaf1a8b08c6c", "format": "Archive BitTorrent" }, { "name": "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716_files.xml", "source": "original", "format": "Metadata", "md5": "ff800b949380673efa6d2a09069bf1e1", "summation": "md5" }, { "name": "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716_meta.sqlite", "source": "original", "mtime": "1594875385", "size": "14336", "md5": "88b1069e4783d828f3436309eef8dbc8", "crc32": "a8763901", "sha1": "5eed633c021c84b6934d516c9a5ef67ccc159979", "format": "Metadata" }, { "name": "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716_meta.xml", "source": "original", "mtime": "1613299427", "size": "1431", "md5": "e9d13f6639a11907a7375840e6b292d6", "crc32": "44a88f84", "sha1": "d7c8c5b630197adfad568229012276b7978c1d60", "format": "Metadata" } ], "files_count": 14, "item_last_updated": 1699549454, "item_size": 683612059, "metadata": { "identifier": "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716", "mediatype": "texts", "collection": [ "opensource", "community" ], "creator": "Jakob und Wilhelm Grimm", "date": "1819-01-01", "description": "Die M\u00e4rchen der Gebr\u00fcder Grimm. Kinder- und Hausm\u00e4rchen. 141 MBytes. 
16-07-2020.", "language": "ger", "licenseurl": "https://creativecommons.org/licenses/by-nd/4.0/", "scanner": "Internet Archive HTML5 Uploader 1.6.4", "subject": [ "Gebr\u00fcder Grimm", "M\u00e4rchen", "Kinderm\u00e4rchen", "Hausm\u00e4rchen" ], "title": "Die M\u00e4rchen Der Gebr\u00fcder Grimm Kinder Und Hausm\u00e4rchen 141 MBytes 20200716", "uploader": "iluagaril@gmail.com", "publicdate": "2020-07-16 04:56:00", "addeddate": "2020-07-16 04:56:00", "curation": "[curator]validator@archive.org[/curator][date]20200716045655[/date][comment]checked for malware[/comment]", "identifier-access": "http://archive.org/details/die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716", "identifier-ark": "ark:/13960/t00093c9q", "ppi": "600", "ocr": "ABBYY FineReader 11.0 (Extended OCR)", "page_number_confidence": "96.03" }, "server": "ia903208.us.archive.org", "uniq": 981412115, "workable_servers": [ "ia903208.us.archive.org", "ia803208.us.archive.org" ] }
We see that the format is "DjVuTXT". So let's download our book now:
from internetarchive import download

# Same identifier as used for the metadata lookup; repeated so this cell can
# be run on its own.
BOOK_IDENTIFIER = "die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716"
# Format name of the plain-text file, as listed in the item's file metadata.
TEXT_FORMAT = "DjVuTXT"

# Download only the files of the requested format, printing progress.
download(BOOK_IDENTIFIER, formats=TEXT_FORMAT, verbose=True)
die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716: skipping die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716/Die Märchen der Gebrüder Grimm_Kinder- und Hausmärchen_141 MBytes_20200716_djvu.txt, file already exists based on length and date.
[]
We will use word_cloud to get a sense of what the most frequently used words are in the context of storytelling:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS
# Get the data directory: the script's own folder when run as a file,
# otherwise the current working directory (getcwd() is needed to support
# running the example in a generated IPython notebook).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text. The encoding is given explicitly because the book
# contains German umlauts and the platform default encoding is not always
# UTF-8; the context manager makes sure the file handle is closed.
text_path = path.join(d, 'die-marchen-der-gebruder-grimm-kinder-und-hausmarchen-141-mbytes-20200716/Die Märchen der Gebrüder Grimm_Kinder- und Hausmärchen_141 MBytes_20200716_djvu.txt')
with open(text_path, encoding="utf-8") as text_file:
    text = text_file.read()

# Read the mask image that gives the word cloud its shape.
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "img/alice_mask.png")))

# NOTE(review): STOPWORDS and the extra word "said" look like they were taken
# over from an English example; German function words are presumably not
# filtered out. Confirm and extend the set with German stop words if needed.
stopwords = set(STOPWORDS)
stopwords.add("said")

wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=alice_mask,
    stopwords=stopwords,
    contour_width=3,
    contour_color='steelblue',
)

# Generate the word cloud from the book text.
wc.generate(text)

# Store the rendered cloud to a file next to the data.
wc.to_file(path.join(d, "alice.png"))

# Show the word cloud, then the mask in a second figure.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()
Those who have read "Five Stars: The Communication Secrets to Get from Good to Great" might still remember the Hemingway App mentioned in the book. Linguistically, it's calculating the Gunning fog index. The index, however, works only for English.
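The formula, on the other hand, involves 2 factors: the average length of the sentences (words per sentence) and the percentage of complex words (words with three or more syllables).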
The length distribution of the sentences can be calculated and visualized as follows:
import re

# Split the text into sentences. A sentence ends at '.', '!' or '?' followed
# by any whitespace; this also catches sentences that end at a line break,
# which a plain split on '. ' would miss. Empty fragments are dropped so they
# do not show up as zero-length sentences.
sentences = [s for s in re.split(r'[.!?]+\s+', text) if s.strip()]

# Number of whitespace-separated words per sentence, longest first.
sentence_lenghts = [len(s.split()) for s in sentences]
sentence_lenghts.sort(reverse=True)

# Plot the distribution of sentence lengths as a histogram.
data = np.array(sentence_lenghts)
plt.hist(data, bins=100)
plt.xlabel("Length of Sentence")
plt.ylabel("Frequency")
plt.title("The Length Distribution of the Sentences")
plt.show()