diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 5036c3f58a5633c..a985e9152dda0fe 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -9,6 +9,7 @@ 'add_codec', ] +import codecs from functools import partial import email.base64mime @@ -61,40 +62,73 @@ 'utf-8': (SHORTEST, BASE64, 'utf-8'), } -# Aliases for other commonly-used names for character sets. Map -# them to the real ones used in email. +# Map Python codec names to their corresponding MIME/IANA names. ALIASES = { - 'latin_1': 'iso-8859-1', - 'latin-1': 'iso-8859-1', - 'latin_2': 'iso-8859-2', - 'latin-2': 'iso-8859-2', - 'latin_3': 'iso-8859-3', - 'latin-3': 'iso-8859-3', - 'latin_4': 'iso-8859-4', - 'latin-4': 'iso-8859-4', - 'latin_5': 'iso-8859-9', - 'latin-5': 'iso-8859-9', - 'latin_6': 'iso-8859-10', - 'latin-6': 'iso-8859-10', - 'latin_7': 'iso-8859-13', - 'latin-7': 'iso-8859-13', - 'latin_8': 'iso-8859-14', - 'latin-8': 'iso-8859-14', - 'latin_9': 'iso-8859-15', - 'latin-9': 'iso-8859-15', - 'latin_10':'iso-8859-16', - 'latin-10':'iso-8859-16', - 'cp949': 'ks_c_5601-1987', - 'euc_jp': 'euc-jp', - 'euc_kr': 'euc-kr', - 'ascii': 'us-ascii', - } + 'ascii': 'us-ascii', + 'big5hkscs': 'big5-hkscs', + 'cp037': 'ibm037', + 'cp1026': 'ibm1026', + 'cp1140': 'ibm01140', + 'cp1250': 'windows-1250', + 'cp1251': 'windows-1251', + 'cp1252': 'windows-1252', + 'cp1253': 'windows-1253', + 'cp1254': 'windows-1254', + 'cp1255': 'windows-1255', + 'cp1256': 'windows-1256', + 'cp1257': 'windows-1257', + 'cp1258': 'windows-1258', + 'cp273': 'ibm273', + 'cp424': 'ibm424', + 'cp437': 'ibm437', + 'cp500': 'ibm500', + 'cp775': 'ibm775', + 'cp850': 'ibm850', + 'cp852': 'ibm852', + 'cp855': 'ibm855', + 'cp857': 'ibm857', + 'cp858': 'ibm00858', + 'cp860': 'ibm860', + 'cp861': 'ibm861', + 'cp862': 'ibm862', + 'cp863': 'ibm863', + 'cp864': 'ibm864', + 'cp865': 'ibm865', + 'cp866': 'ibm866', + 'cp869': 'ibm869', + 'cp874': 'windows-874', + 'euc_jp': 'euc-jp', + 'euc_kr': 'euc-kr', + 'hz': 'hz-gb-2312', + 
'iso2022_jp': 'iso-2022-jp', + 'iso2022_jp_2': 'iso-2022-jp-2', + 'iso2022_kr': 'iso-2022-kr', + 'iso8859-1': 'iso-8859-1', + 'iso8859-10': 'iso-8859-10', + 'iso8859-11': 'iso-8859-11', + 'iso8859-13': 'iso-8859-13', + 'iso8859-14': 'iso-8859-14', + 'iso8859-15': 'iso-8859-15', + 'iso8859-16': 'iso-8859-16', + 'iso8859-2': 'iso-8859-2', + 'iso8859-3': 'iso-8859-3', + 'iso8859-4': 'iso-8859-4', + 'iso8859-5': 'iso-8859-5', + 'iso8859-6': 'iso-8859-6', + 'iso8859-7': 'iso-8859-7', + 'iso8859-8': 'iso-8859-8-i', + 'iso8859-9': 'iso-8859-9', + 'kz1048': 'kz-1048', + 'mac-roman': 'macintosh', + + # CP949 is not registered with IANA. KS_C_5601-1987 is not the same, + # but it is the closest registered option. + 'cp949': 'ks_c_5601-1987', +} # Map charsets to their Unicode codec strings. CODEC_MAP = { - 'gb2312': 'eucgb2312_cn', - 'big5': 'big5_tw', # Hack: We don't want *any* conversion for stuff marked us-ascii, as all # sorts of garbage might be sent to us in the guise of 7-bit us-ascii. # Let that stuff pass through without conversion to/from Unicode. @@ -217,7 +251,18 @@ def __init__(self, input_charset=DEFAULT_CHARSET): raise errors.CharsetError(input_charset) input_charset = input_charset.lower() # Set the input charset after filtering through the aliases - self.input_charset = ALIASES.get(input_charset, input_charset) + # For backward compatibility, try ALIASES first to let the user + # override it. + if input_charset in ALIASES: + input_charset = ALIASES[input_charset] + else: + try: + input_codec = codecs.lookup(input_charset).name + except LookupError: + pass + else: + input_charset = ALIASES.get(input_codec, input_codec) + self.input_charset = input_charset # We can try to guess which encoding and conversion to use by the # charset_map dictionary. Try that first, but let the user override # it. @@ -228,7 +273,7 @@ def __init__(self, input_charset=DEFAULT_CHARSET): # Set the attributes, allowing the arguments to override the default. 
self.header_encoding = henc self.body_encoding = benc - self.output_charset = ALIASES.get(conv, conv) + self.output_charset = conv # Now set the codecs. If one isn't defined for input_charset, # guess and try a Unicode codec with the same name as input_codec. self.input_codec = CODEC_MAP.get(self.input_charset, diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py index 13fcb9787f1f320..faf2626bccce651 100644 --- a/Lib/email/contentmanager.py +++ b/Lib/email/contentmanager.py @@ -173,11 +173,11 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, disposition=None, filename=None, cid=None, params=None, headers=None): _prepare_set(msg, 'text', subtype, headers) + + charset = email.charset.Charset(charset).input_charset cte, payload = _encode_text(string, charset, cte, msg.policy) msg.set_payload(payload) - msg.set_param('charset', - email.charset.ALIASES.get(charset, charset), - replace=True) + msg.set_param('charset', charset, replace=True) msg['Content-Transfer-Encoding'] = cte _finalize_set(msg, disposition, filename, cid, params) raw_data_manager.add_set_handler(str, set_text_content) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index d2c2261edbe04e1..5dad2d6dd5e1e0d 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4970,6 +4970,127 @@ def tearDown(self): except KeyError: pass + def test_attributes(self): + from email import charset + c = Charset() + self.assertEqual(c.input_charset, 'us-ascii') + self.assertEqual(c.header_encoding, None) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'us-ascii') + self.assertEqual(c.input_codec, None) + self.assertEqual(c.output_codec, None) + + c = Charset('us-ascii') + self.assertEqual(c.input_charset, 'us-ascii') + self.assertEqual(c.header_encoding, None) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'us-ascii') + 
self.assertEqual(c.input_codec, None) + self.assertEqual(c.output_codec, None) + + c = Charset('utf8') + self.assertEqual(c.input_charset, 'utf-8') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'utf-8') + self.assertEqual(c.input_codec, 'utf-8') + self.assertEqual(c.output_codec, 'utf-8') + + c = Charset('latin1') + self.assertEqual(c.input_charset, 'iso-8859-1') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'iso-8859-1') + self.assertEqual(c.input_codec, 'iso-8859-1') + self.assertEqual(c.output_codec, 'iso-8859-1') + + c = Charset('latin9') + self.assertEqual(c.input_charset, 'iso-8859-15') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'iso-8859-15') + self.assertEqual(c.input_codec, 'iso-8859-15') + self.assertEqual(c.output_codec, 'iso-8859-15') + + c = Charset('cyrillic') + self.assertEqual(c.input_charset, 'iso-8859-5') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'iso-8859-5') + self.assertEqual(c.input_codec, 'iso-8859-5') + self.assertEqual(c.output_codec, 'iso-8859-5') + + c = Charset('cp1251') + self.assertEqual(c.input_charset, 'windows-1251') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'windows-1251') + self.assertEqual(c.input_codec, 'windows-1251') + self.assertEqual(c.output_codec, 'windows-1251') + + c = Charset('cp1252') + self.assertEqual(c.input_charset, 'windows-1252') + self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, charset.QP) + self.assertEqual(c.output_charset, 'windows-1252') + self.assertEqual(c.input_codec, 'windows-1252') + 
self.assertEqual(c.output_codec, 'windows-1252') + + c = Charset('eucjp') + self.assertEqual(c.input_charset, 'euc-jp') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'iso-2022-jp') + self.assertEqual(c.input_codec, 'euc-jp') + self.assertEqual(c.output_codec, 'iso-2022-jp') + + c = Charset('cp949') + self.assertEqual(c.input_charset, 'ks_c_5601-1987') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'ks_c_5601-1987') + self.assertEqual(c.input_codec, 'ks_c_5601-1987') + self.assertEqual(c.output_codec, 'ks_c_5601-1987') + + c = Charset('gb2312') + self.assertEqual(c.input_charset, 'gb2312') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'gb2312') + self.assertEqual(c.input_codec, 'gb2312') + self.assertEqual(c.output_codec, 'gb2312') + + c = Charset('big5') + self.assertEqual(c.input_charset, 'big5') + self.assertEqual(c.header_encoding, charset.BASE64) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'big5') + self.assertEqual(c.input_codec, 'big5') + self.assertEqual(c.output_codec, 'big5') + + def test_user_charsets(self): + from email import charset + c = Charset('fake0') + self.assertEqual(c.input_charset, 'fake0') + self.assertEqual(c.header_encoding, charset.SHORTEST) + self.assertEqual(c.body_encoding, charset.BASE64) + self.assertEqual(c.output_charset, 'fake0') + self.assertEqual(c.input_codec, 'fake0') + self.assertEqual(c.output_codec, 'fake0') + + charset.add_alias('fake1', 'mime-fake') + charset.add_codec('mime-fake', 'fakecodec') + charset.add_codec('output-mime-fake', 'outputfakecodec') + charset.add_charset('mime-fake', charset.QP, None, 'output-mime-fake') + + c = Charset('fake1') + self.assertEqual(c.input_charset, 'mime-fake') + 
self.assertEqual(c.header_encoding, charset.QP) + self.assertEqual(c.body_encoding, None) + self.assertEqual(c.output_charset, 'output-mime-fake') + self.assertEqual(c.input_codec, 'fakecodec') + self.assertEqual(c.output_codec, 'outputfakecodec') + def test_codec_encodeable(self): eq = self.assertEqual # Make sure us-ascii = no Unicode conversion @@ -5010,6 +5131,11 @@ def test_unicode_charset_name(self): self.assertEqual(str(charset), 'us-ascii') self.assertRaises(errors.CharsetError, Charset, 'asc\xffii') + def test_bytes_charset_name(self): + charset = Charset(b'us-ascii') + self.assertEqual(str(charset), 'us-ascii') + self.assertRaises(errors.CharsetError, Charset, b'asc\xffii') + # Test multilingual MIME headers. diff --git a/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst b/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst new file mode 100644 index 000000000000000..283a5ba44d1f19f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-17-12-37-59.gh-issue-53144.c5tr1p.rst @@ -0,0 +1,2 @@ +The :mod:`email` package now supports all aliases of Python codecs and uses +MIME/IANA names for all IANA-registered charsets.