#!/usr/bin/env python
# coding: utf-8

# Script export of the "Encoding" course notebook.
#
# Each "# In[ ]:" marker below starts one notebook cell. The future import
# lives here, and not in the cell that introduces it, because Python only
# accepts `from __future__` imports at the very beginning of a module.
from __future__ import unicode_literals, print_function

# # Encoding
#
# ## Goals
#
#   - A string is more than a sequence of bytes
#   - A string is a couple (bytes, encoding)
#   - Use unicode_literals in python2
#   - Manage differently encoded filenames
#   - A string is not a sequence of bytes

# ## Modules

# In[ ]:

import os
import os.path
import glob

# In[ ]:

from os.path import isdir

# Working directory shared by the cells below; created on first run.
basedir = "/tmp/course"
if not isdir(basedir):
    os.makedirs(basedir)

# # Song of Childhood
#
# (German original, followed by an English rendering.)
#
# Als das Kind Kind war,
# ging es mit hängenden Armen,
# wollte der Bach sei ein Fluß,
# der Fluß sei ein Strom,
# und diese Pfütze das Meer.
#
# Als das Kind Kind war,
# wußte es nicht, daß es Kind war,
# alles war ihm beseelt,
# und alle Seelen waren eins.
#
# "When the child was a child,
# characters were bytes, and
# strings lists of bytes."
#
# When the child was a child
# It walked with its arms swinging,
# wanted the brook to be a river,
# the river to be a torrent,
# and this puddle to be the sea.
#
# When the child was a child,
# it didn’t know that it was a child,
# everything was soulful,
# and all souls were one.

# # Encoding is a map
#
# Encoding is a map between typographical characters and byte-sequences.
#
# Decoding is its reverse map.
#
# | char -> | utf8       | cp1252 | ascii |
# |---------|------------|--------|-------|
# | y ->    | [121]      | [121]  | [121] |
# | z ->    | [122]      | [122]  | [122] |
# | { ->    | [123]      | [123]  | [123] |
# | ¢ ->    | [194, 162] | [162]  | -     |
# | £ ->    | [194, 163] | [163]  | -     |
# | ¤ ->    | [194, 164] | [164]  | -     |
# | ¥ ->    | [194, 165] | [165]  | -     |
# | Ɓ ->    | [198, 129] | -      | -     |
# | Ƃ ->    | [198, 130] | -      | -     |
# | ƃ ->    | [198, 131] | -      | -     |

# In[ ]:

# Py3 doesn't need the 'u' prefix before the string.
the_string = u"S\u00fcd"  # Sued
print(the_string)

# In[ ]:

# the_string Sued can be encoded in different...
in_utf8 = the_string.encode('utf-8')
in_win = the_string.encode('cp1252')
# ...byte-sequences
assert isinstance(in_utf8, bytes)

# In[ ]:

# Now you can see the differences between
print(repr(in_utf8))
# and
print(repr(in_win))

# In[ ]:

# Decoding bytes using the wrong map...
# ...gives garbled results: the UTF-8 bytes of "Süd" read as cp1252
# come out as "SÃ¼d".
print(in_utf8.decode('cp1252'))

# In[ ]:

# Filenames are actually binary data:
# we should be careful when our scripts read them,
# e.g. from a vfat filesystem.

# To make Py2 encoding-aware we must use
#     from __future__ import unicode_literals, print_function
# A future import has to be the first statement of a module, so in this
# script form it cannot be executed from the middle of the file.

# Create 3 windows-encoded filenames in basedir
# using the provided function
from course import create_espana
create_espana(basedir)

# In[ ]:

# Just list the newly created files
# and check that they are not showing correctly (unless we have windows :D)
# NOTE: get_ipython() exists only when running under IPython/Jupyter.
get_ipython().system('dir {basedir}')

# In[ ]:

from glob import glob as ls  # expands wildcards like ls

# To avoid encoding issues like the following...
files = ls("/tmp/course/*.txt")
# UnicodeDecodeError: 'ascii' codec can't decode
#     byte 0xe9 in position 5:      # remember ñ in cp1252
#     ordinal not in range(128)
# NOTE(review): in cp1252 byte 0xe9 is "é" while "ñ" is 0xf1; the byte
# actually reported depends on the names create_espana writes -- confirm.

# In[ ]:

# We must explicitly use bytes, prefixing the pattern with "b"
files = ls(b"/tmp/course/*.txt")
# And the file names are shown as bytes.
print(files)

# In[ ]:

# Exercise: don't run this cell!
# Which outcome do you expect from the following instruction?
# (Left commented out on purpose: the cell is marked as not to be run, and
# `files` holds byte strings while the separator is a text string.)
# print('\n'.join(files))

# In[ ]: