Python unicode cheatsheet
Encode: unicode code point to bytes
> >> s = u'Café'
> >> type(s .encode ('utf-8' ))
< class 'bytes' >
Decode: bytes to unicode code point
> >> s = bytes ('Café' , encoding = 'utf-8' )
> >> s .decode ('utf-8' )
'Café'
> >> s = u'Café'
> >> for _c in s : print ('U+%04x' % ord (_c ))
...
U + 0043
U + 0061
U + 0066
U + 00e9
> >> u = '中文'
> >> for _c in u : print ('U+%04x' % ord (_c ))
...
U + 4e2 d
U + 6587
python2 str
is equivalent to byte string
> >> s = 'Café' # byte string
> >> s
'Caf\xc3 \xa9 '
> >> type(s )
< type 'str' >
> >> u = u'Café' # unicode string
> >> u
u'Caf\xe9 '
> >> type(u )
< type 'unicode' >
python3 str
is equivalent to unicode string
> >> s = 'Café'
> >> type(s )
< class 'str' >
> >> s
'Café'
>> > s .encode ('utf-8' )
b'Caf\xc3 \xa9 '
>> > s .encode ('utf-8' ).decode ('utf-8' )
'Café'
python2 take str
char as byte character
> >> s = 'Café'
> >> print [_c for _c in s ]
['C' , 'a' , 'f' , '\xc3 ' , '\xa9 ' ]
> >> len (s )
5
> >> s = u'Café'
> >> print [_c for _c in s ]
[u'C' , u'a' , u'f' , u'\xe9 ' ]
> >> len (s )
4
python3 take str
char as unicode character
> >> s = 'Café'
> >> print ([_c for _c in s ])
['C' , 'a' , 'f' , 'é' ]
> >> len (s )
4
> >> bs = bytes (s , encoding = 'utf-8' )
> >> print (bs )
b'Caf\xc3 \xa9 '
> >> len (bs )
5
# python 3
> >> u1 = 'Café' # unicode string
> >> u2 = 'Cafe\u0301 '
> >> u1 , u2
('Café' , 'Café' )
>> > len (u1 ), len (u2 )
(4 , 5 )
>> > u1 == u2
False
>> > u1 .encode ('utf-8' ) # get u1 byte string
b'Caf\xc3 \xa9 '
>> > u2 .encode ('utf-8' ) # get u2 byte string
b'Cafe\xcc \x81 '
>> > from unicodedata import normalize
> >> s1 = normalize ('NFC' , u1 ) # get u1 NFC format
> >> s2 = normalize ('NFC' , u2 ) # get u2 NFC format
> >> s1 == s2
True
> >> s1 .encode ('utf-8' ), s2 .encode ('utf-8' )
(b'Caf\xc3 \xa9 ' , b'Caf\xc3 \xa9 ' )
> >> s1 = normalize ('NFD' , u1 ) # get u1 NFD format
> >> s2 = normalize ('NFD' , u2 ) # get u2 NFD format
> >> s1 , s2
('Café' , 'Café' )
>> > s1 == s2
True
>> > s1 .encode ('utf-8' ), s2 .encode ('utf-8' )
(b'Cafe\xcc \x81 ' , b'Cafe\xcc \x81 ' )