Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 77 additions & 30 deletions Lib/email/charset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
'add_codec',
]

import codecs
from functools import partial

import email.base64mime
Expand Down Expand Up @@ -61,40 +62,75 @@
'utf-8': (SHORTEST, BASE64, 'utf-8'),
}

# Aliases for other commonly-used names for character sets. Map
# them to the real ones used in email.
# Map Python codec names to their corresponding MIME/IANA names.
ALIASES = {
'latin_1': 'iso-8859-1',
'ascii': 'us-ascii',
'big5hkscs': 'big5-hkscs',
'cp037': 'ibm037',
'cp1026': 'ibm1026',
'cp1140': 'ibm01140',
'cp1250': 'windows-1250',
'cp1251': 'windows-1251',
'cp1252': 'windows-1252',
'cp1253': 'windows-1253',
'cp1254': 'windows-1254',
'cp1255': 'windows-1255',
'cp1256': 'windows-1256',
'cp1257': 'windows-1257',
'cp1258': 'windows-1258',
'cp273': 'ibm273',
'cp424': 'ibm424',
'cp437': 'ibm437',
'cp500': 'ibm500',
'cp775': 'ibm775',
'cp850': 'ibm850',
'cp852': 'ibm852',
'cp855': 'ibm855',
'cp857': 'ibm857',
'cp858': 'ibm00858',
'cp860': 'ibm860',
'cp861': 'ibm861',
'cp862': 'ibm862',
'cp863': 'ibm863',
'cp864': 'ibm864',
'cp865': 'ibm865',
'cp866': 'ibm866',
'cp869': 'ibm869',
'cp874': 'windows-874',
'euc_jp': 'euc-jp',
'euc_kr': 'euc-kr',
'hz': 'hz-gb-2312',
'iso2022_jp': 'iso-2022-jp',
'iso2022_jp_2': 'iso-2022-jp-2',
'iso2022_kr': 'iso-2022-kr',
'iso8859-1': 'iso-8859-1',
'iso8859-10': 'iso-8859-10',
'iso8859-11': 'iso-8859-11',
'iso8859-13': 'iso-8859-13',
'iso8859-14': 'iso-8859-14',
'iso8859-15': 'iso-8859-15',
'iso8859-16': 'iso-8859-16',
'iso8859-2': 'iso-8859-2',
'iso8859-3': 'iso-8859-3',
'iso8859-4': 'iso-8859-4',
'iso8859-5': 'iso-8859-5',
'iso8859-6': 'iso-8859-6',
'iso8859-7': 'iso-8859-7',
'iso8859-8': 'iso-8859-8-i',
'iso8859-9': 'iso-8859-9',
'kz1048': 'kz-1048',
'mac-roman': 'macintosh',

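# latin-1 is a built-in codec, separate from the generated iso8859-1 codec.
# Both implement the same encoding, but codecs.lookup() reports a different
# name for each, so latin-1 needs its own entry.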
'latin-1': 'iso-8859-1',
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latin-1 codec is builtin, whereas the iso8859_1 one is generated from the Unicode mapping sets. They both implement the same codec, though, so the comment is misleading.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is why we need this mapping: codecs.lookup() returns different CodecInfo objects with different names. I think that it would be better to make latin1 a simple alias of iso-8859-1, but this is a different issue.

How would you write a comment?

'latin_2': 'iso-8859-2',
'latin-2': 'iso-8859-2',
'latin_3': 'iso-8859-3',
'latin-3': 'iso-8859-3',
'latin_4': 'iso-8859-4',
'latin-4': 'iso-8859-4',
'latin_5': 'iso-8859-9',
'latin-5': 'iso-8859-9',
'latin_6': 'iso-8859-10',
'latin-6': 'iso-8859-10',
'latin_7': 'iso-8859-13',
'latin-7': 'iso-8859-13',
'latin_8': 'iso-8859-14',
'latin-8': 'iso-8859-14',
'latin_9': 'iso-8859-15',
'latin-9': 'iso-8859-15',
'latin_10':'iso-8859-16',
'latin-10':'iso-8859-16',
'cp949': 'ks_c_5601-1987',
'euc_jp': 'euc-jp',
'euc_kr': 'euc-kr',
'ascii': 'us-ascii',
}
# CP949 is not registered with IANA. KS_C_5601-1987 is not identical to it,
# but it is the closest registered charset.
'cp949': 'ks_c_5601-1987',
}


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
'gb2312': 'eucgb2312_cn',
'big5': 'big5_tw',
# Hack: We don't want *any* conversion for stuff marked us-ascii, as all
# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
# Let that stuff pass through without conversion to/from Unicode.
Expand Down Expand Up @@ -217,7 +253,18 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
raise errors.CharsetError(input_charset)
input_charset = input_charset.lower()
# Set the input charset after filtering through the aliases
self.input_charset = ALIASES.get(input_charset, input_charset)
# For backward compatibility, consult ALIASES first so that an alias
# registered by the user takes precedence over the codec lookup below.
if input_charset in ALIASES:
input_charset = ALIASES[input_charset]
else:
try:
input_codec = codecs.lookup(input_charset).name
except LookupError:
pass
else:
input_charset = ALIASES.get(input_codec, input_codec)
self.input_charset = input_charset
# We can try to guess which encoding and conversion to use by the
# charset_map dictionary. Try that first, but let the user override
# it.
Expand All @@ -228,7 +275,7 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
# Set the attributes, allowing the arguments to override the default.
self.header_encoding = henc
self.body_encoding = benc
self.output_charset = ALIASES.get(conv, conv)
self.output_charset = conv
# Now set the codecs. If one isn't defined for input_charset,
# guess and try a Unicode codec with the same name as input_codec.
self.input_codec = CODEC_MAP.get(self.input_charset,
Expand Down
126 changes: 126 additions & 0 deletions Lib/test/test_email/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -4970,6 +4970,127 @@ def tearDown(self):
except KeyError:
pass

def test_attributes(self):
    # Verify all six public attributes of Charset for the default charset,
    # for canonical MIME names, and for Python codec aliases that have to
    # be translated to their MIME/IANA names.
    from email import charset
    SHORTEST = charset.SHORTEST
    BASE64 = charset.BASE64
    QP = charset.QP

    # Each row: (constructor args,
    #            input_charset, header_encoding, body_encoding,
    #            output_charset, input_codec, output_codec)
    cases = [
        # No conversion at all is expected for us-ascii.
        ((), 'us-ascii', None, None, 'us-ascii', None, None),
        (('us-ascii',), 'us-ascii', None, None, 'us-ascii', None, None),
        (('utf8',), 'utf-8', SHORTEST, BASE64,
         'utf-8', 'utf-8', 'utf-8'),
        (('latin1',), 'iso-8859-1', QP, QP,
         'iso-8859-1', 'iso-8859-1', 'iso-8859-1'),
        (('latin9',), 'iso-8859-15', QP, QP,
         'iso-8859-15', 'iso-8859-15', 'iso-8859-15'),
        (('cyrillic',), 'iso-8859-5', SHORTEST, BASE64,
         'iso-8859-5', 'iso-8859-5', 'iso-8859-5'),
        (('cp1251',), 'windows-1251', SHORTEST, BASE64,
         'windows-1251', 'windows-1251', 'windows-1251'),
        (('cp1252',), 'windows-1252', QP, QP,
         'windows-1252', 'windows-1252', 'windows-1252'),
        # euc-jp is the one case here whose output charset differs from
        # its input charset.
        (('eucjp',), 'euc-jp', BASE64, None,
         'iso-2022-jp', 'euc-jp', 'iso-2022-jp'),
        (('cp949',), 'ks_c_5601-1987', SHORTEST, BASE64,
         'ks_c_5601-1987', 'ks_c_5601-1987', 'ks_c_5601-1987'),
        (('gb2312',), 'gb2312', BASE64, BASE64,
         'gb2312', 'gb2312', 'gb2312'),
        (('big5',), 'big5', BASE64, BASE64,
         'big5', 'big5', 'big5'),
    ]
    for args, in_cs, henc, benc, out_cs, in_codec, out_codec in cases:
        c = Charset(*args)
        self.assertEqual(c.input_charset, in_cs)
        self.assertEqual(c.header_encoding, henc)
        self.assertEqual(c.body_encoding, benc)
        self.assertEqual(c.output_charset, out_cs)
        self.assertEqual(c.input_codec, in_codec)
        self.assertEqual(c.output_codec, out_codec)

def test_user_charsets(self):
    # Check that aliases, codecs and charsets registered by the user at
    # run time are picked up by Charset.
    from email import charset

    # A name that is registered nowhere is expected to pass through
    # unchanged with the SHORTEST/BASE64 encodings.
    c = Charset('fake0')
    self.assertEqual(c.input_charset, 'fake0')
    self.assertEqual(c.header_encoding, charset.SHORTEST)
    self.assertEqual(c.body_encoding, charset.BASE64)
    self.assertEqual(c.output_charset, 'fake0')
    self.assertEqual(c.input_codec, 'fake0')
    self.assertEqual(c.output_codec, 'fake0')

    # The add_*() functions mutate module-level registries.  Schedule the
    # removal of every fake entry before adding it, so nothing leaks into
    # other tests even if an assertion below fails.
    self.addCleanup(charset.ALIASES.pop, 'fake1', None)
    self.addCleanup(charset.CODEC_MAP.pop, 'mime-fake', None)
    self.addCleanup(charset.CODEC_MAP.pop, 'output-mime-fake', None)
    self.addCleanup(charset.CHARSETS.pop, 'mime-fake', None)
    charset.add_alias('fake1', 'mime-fake')
    charset.add_codec('mime-fake', 'fakecodec')
    charset.add_codec('output-mime-fake', 'outputfakecodec')
    charset.add_charset('mime-fake', charset.QP, None, 'output-mime-fake')

    # The alias resolves to the registered charset, whose encodings,
    # output charset and codecs must all come from the user's entries.
    c = Charset('fake1')
    self.assertEqual(c.input_charset, 'mime-fake')
    self.assertEqual(c.header_encoding, charset.QP)
    self.assertEqual(c.body_encoding, None)
    self.assertEqual(c.output_charset, 'output-mime-fake')
    self.assertEqual(c.input_codec, 'fakecodec')
    self.assertEqual(c.output_codec, 'outputfakecodec')

def test_codec_encodeable(self):
eq = self.assertEqual
# Make sure us-ascii = no Unicode conversion
Expand Down Expand Up @@ -5010,6 +5131,11 @@ def test_unicode_charset_name(self):
self.assertEqual(str(charset), 'us-ascii')
self.assertRaises(errors.CharsetError, Charset, 'asc\xffii')

def test_bytes_charset_name(self):
    # An ASCII-only bytes name is accepted and normalised to str; a bytes
    # name containing a non-ASCII byte is rejected with CharsetError.
    cs = Charset(b'us-ascii')
    self.assertEqual(str(cs), 'us-ascii')
    with self.assertRaises(errors.CharsetError):
        Charset(b'asc\xffii')



# Test multilingual MIME headers.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The :mod:`email` package now accepts every alias of a Python codec as a
charset name and uses the MIME/IANA name for all IANA-registered charsets.
Loading