From 4386e3a713b5f3c52487e6adaa49269ac39f810d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 17 May 2026 22:37:07 +0300 Subject: [PATCH] gh-88726: Stop using non-standard charset names eucgb2312_cn and big5_tw in email --- Lib/email/charset.py | 2 - Lib/test/test_email/test_asian_codecs.py | 56 +++++++++++++++++++ ...6-05-17-22-37-02.gh-issue-88726.BAoL6j.rst | 2 + 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 5036c3f58a5633..c4b246455f86c6 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -93,8 +93,6 @@ # Map charsets to their Unicode codec strings. CODEC_MAP = { - 'gb2312': 'eucgb2312_cn', - 'big5': 'big5_tw', # Hack: We don't want *any* conversion for stuff marked us-ascii, as all # sorts of garbage might be sent to us in the guise of 7-bit us-ascii. # Let that stuff pass through without conversion to/from Unicode. diff --git a/Lib/test/test_email/test_asian_codecs.py b/Lib/test/test_email/test_asian_codecs.py index ca44f54c69b39b..85979ffd8169a7 100644 --- a/Lib/test/test_email/test_asian_codecs.py +++ b/Lib/test/test_email/test_asian_codecs.py @@ -58,6 +58,62 @@ def test_japanese_codecs(self): # TK: full decode comparison eq(str(h).encode(jcode), subject_bytes) + h = Header("Japanese") + s = '\u65e5\u672c\u8a9e' # 日本語 + h.append(s, Charset('euc-jp')) + h.append(s, Charset('iso-2022-jp')) + h.append(s, Charset('shift_jis')) + eq(h.encode(), """\ +Japanese =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?= =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?= + =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?=""") + eq(decode_header(h.encode()), + [(b'Japanese ', None), + (b'\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B', 'iso-2022-jp'), + ]) + + def test_chinese_codecs(self): + eq = self.ndiffAssertEqual + h = Header("Chinese") + s = '\u4e2d\u6587' # 中文 + h.append(s, Charset('gb2312')) + h.append(s, Charset('gbk')) + h.append(s, Charset('gb18030')) + h.append(s, Charset('hz')) + h.append(s, Charset('big5')) + h.append(s, Charset('big5hkscs')) + eq(h.encode(), """\ +Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?= + =?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""") + eq(decode_header(h.encode()), + [(b'Chinese ', None), + (b'\xd6\xd0\xce\xc4', 'gb2312'), + (b'\xd6\xd0\xce\xc4', 'gbk'), + (b'\xd6\xd0\xce\xc4', 'gb18030'), + (b'~{VPND~}', 'hz'), + (b'\xa4\xa4\xa4\xe5', 'big5'), + (b'\xa4\xa4\xa4\xe5', 'big5hkscs'), + ]) + + def test_korean_codecs(self): + eq = self.ndiffAssertEqual + h = Header("Korean") + s = '\ud55c\uad6d\uc5b4' # 한국어 + h.append(s, Charset('euc-kr')) + h.append(s, Charset('ks_c_5601-1987')) + h.append(s, Charset('cp949')) + h.append(s, Charset('iso-2022-kr')) + h.append(s, Charset('johab')) + eq(h.encode(), """\ +Korean =?euc-kr?b?x9Gxub7u?= =?ks_c_5601-1987?b?x9Gxub7uIMfRsbm+7g==?= + =?iso-2022-kr?b?GyQpQw5HUTE5Pm4P?= =?johab?b?0GWKgrTh?=""") + eq(decode_header(h.encode()), + [(b'Korean ', None), + (b'\xc7\xd1\xb1\xb9\xbe\xee', 'euc-kr'), + (b'\xc7\xd1\xb1\xb9\xbe\xee \xc7\xd1\xb1\xb9\xbe\xee', 'ks_c_5601-1987'), + (b'\x1b$)C\x0eGQ19>n\x0f', 'iso-2022-kr'), + (b'\xd0e\x8a\x82\xb4\xe1', 'johab'), + ]) + def test_payload_encoding_utf8(self): jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc' b'\xa5\xeb\xa5\xc9\xa1\xaa', 'euc-jp') diff --git a/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst b/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst new file mode 100644 index 00000000000000..ba9058d79c9873 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst @@ -0,0 +1,2 @@ +The :mod:`email` package now uses standard MIME charset names "gb2312" and +"big5" instead of non-standard names "eucgb2312_cn" and "big5_tw".