diff --git a/benchmark/buffers/buffer-tostring-utf8-latin1.js b/benchmark/buffers/buffer-tostring-utf8-latin1.js
new file mode 100644
index 00000000000000..2996da9d307676
--- /dev/null
+++ b/benchmark/buffers/buffer-tostring-utf8-latin1.js
@@ -0,0 +1,81 @@
+'use strict';
+
+// Measures buf.toString('utf8') over several buffer sizes and byte patterns.
+
+const common = require('../common.js');
+
+const options = {
+  size: [64, 1024, 16384, 262144, 4194304],
+  content: ['ascii', 'latin1', 'utf8_mixed', 'latin1_then_cjk'],
+  n: [1e4],
+};
+
+const bench = common.createBenchmark(main, options);
+
+// Copies `pattern` into `target` back to back, starting at `start`, for as
+// long as a whole copy ends at or before `limit`. Bytes past the last whole
+// copy are left untouched.
+function repeatInto(target, pattern, start, limit) {
+  const step = pattern.length;
+  for (let pos = start; pos + step <= limit; pos += step) {
+    pattern.copy(target, pos);
+  }
+}
+
+// Returns a Buffer of `size` bytes whose contents depend on `kind`:
+//   'ascii'           - every byte is 0x61 ('a').
+//   'latin1'          - repeated C3 A9, the UTF-8 encoding of U+00E9.
+//   'utf8_mixed'      - repeated 0x61 followed by E4 B8 AD, the UTF-8
+//                       encoding of U+4E2D.
+//   'latin1_then_cjk' - C3 A9 pairs with a single E4 B8 AD written at
+//                       offset (size >> 1) & ~1.
+// Bytes the patterns do not reach stay zero-filled. Throws an Error for any
+// other `kind`.
+function buildBuffer(kind, size) {
+  switch (kind) {
+    case 'ascii':
+      return Buffer.alloc(size, 0x61);
+
+    case 'latin1': {
+      const eAcute = Buffer.from([0xC3, 0xA9]);
+      const out = Buffer.alloc(size);
+      repeatInto(out, eAcute, 0, size);
+      return out;
+    }
+
+    case 'utf8_mixed': {
+      const unit = Buffer.from([0x61, 0xE4, 0xB8, 0xAD]);
+      const out = Buffer.alloc(size);
+      repeatInto(out, unit, 0, size);
+      return out;
+    }
+
+    case 'latin1_then_cjk': {
+      const eAcute = Buffer.from([0xC3, 0xA9]);
+      const han = Buffer.from([0xE4, 0xB8, 0xAD]);
+      const out = Buffer.alloc(size);
+      // Half the size, rounded down to an even offset so the leading pairs
+      // end exactly where the 3-byte sequence begins.
+      const middle = (size >> 1) & ~1;
+      repeatInto(out, eAcute, 0, middle);
+      han.copy(out, middle);
+      repeatInto(out, eAcute, middle + han.length, size);
+      return out;
+    }
+
+    default:
+      throw new Error('unknown content: ' + kind);
+  }
+}
+
+// Benchmark body: converts one pre-built buffer to a UTF-8 string `n` times.
+// Only the conversion loop sits between bench.start() and bench.end().
+function main({ n, size, content }) {
+  const input = buildBuffer(content, size);
+
+  bench.start();
+  for (let remaining = n; remaining > 0; remaining--) {
+    input.toString('utf8');
+  }
+  bench.end(n);
+}
diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 865302bfd1b4de..e8e4123dbd5b11 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -582,21 +582,86 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, return ExternOneByteString::NewFromCopy(isolate, buf, buflen); } - if (buflen >= 32 && simdutf::validate_utf8(buf, buflen)) { - // We know that we are non-ASCII (and are unlikely Latin1), use 
2-byte - // In the most likely case of valid UTF-8, we can use this fast impl - // For very short input, it is slower, so we limit min size - size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen); - if (u16size > static_cast(v8::String::kMaxLength)) { - isolate->ThrowException(ERR_STRING_TOO_LONG(isolate)); - return MaybeLocal(); + // Latin1-fits fast path: one-byte V8 string, half the heap of UTF-16. + // Capped at 1 MiB (above that the prescan cost erases the win). + constexpr size_t kLatin1Max = 1u << 20; + if (buflen >= 256 && buflen <= kLatin1Max) { + // Skip the allocation when any byte >= 0xC4 (UTF-8 lead for a + // codepoint > U+FF). Inner loop has no early exit so clang + // vectorizes it. + constexpr size_t kChunk = 64; + bool maybe_latin1 = true; + size_t i = 0; + for (; i + kChunk <= buflen; i += kChunk) { + uint8_t acc = 0; + for (size_t j = 0; j < kChunk; j++) { + acc |= static_cast(buf[i + j]) >= 0xC4 ? 1 : 0; + } + if (acc) { + maybe_latin1 = false; + break; + } + } + if (maybe_latin1) { + for (; i < buflen; i++) { + if (static_cast(buf[i]) >= 0xC4) { + maybe_latin1 = false; + break; + } + } + } + if (maybe_latin1) { + MaybeStackBuffer latin1; + latin1.AllocateSufficientStorage(buflen); + simdutf::result l1 = simdutf::convert_utf8_to_latin1_with_errors( + buf, buflen, latin1.out()); + if (l1.error == simdutf::error_code::SUCCESS) { + return ExternOneByteString::NewFromCopy( + isolate, latin1.out(), l1.count); + } + } + } + + if (buflen >= 32) { + // Single-pass UTF-16: over-allocate (1 char16_t per byte), then + // shrink. Above 1 MiB the exact-size 3-pass below is cheaper. 
+ constexpr size_t kSinglePassMax = 1u << 20; + if (buflen <= kSinglePassMax) { + MaybeStackBuffer u16; + u16.AllocateSufficientStorage(buflen); + simdutf::result r = simdutf::convert_utf8_to_utf16_with_errors( + buf, buflen, reinterpret_cast(u16.out())); + if (r.error == simdutf::error_code::SUCCESS) { + if (r.count > static_cast(v8::String::kMaxLength)) { + isolate->ThrowException(ERR_STRING_TOO_LONG(isolate)); + return MaybeLocal(); + } + if (u16.IsAllocated()) { + uint16_t* data = u16.out(); + u16.Release(); + uint16_t* shrunk = static_cast( + realloc(data, r.count * sizeof(uint16_t))); + if (shrunk == nullptr) shrunk = data; + return ExternTwoByteString::New(isolate, shrunk, r.count); + } + return String::NewFromTwoByte(isolate, + u16.out(), + v8::NewStringType::kNormal, + static_cast(r.count)); + } + } else if (simdutf::validate_utf8(buf, buflen)) { + size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen); + if (u16size > static_cast(v8::String::kMaxLength)) { + isolate->ThrowException(ERR_STRING_TOO_LONG(isolate)); + return MaybeLocal(); + } + return EncodeTwoByteString( + isolate, u16size, [buf, buflen, u16size](uint16_t* dst) { + size_t written = simdutf::convert_valid_utf8_to_utf16( + buf, buflen, reinterpret_cast(dst)); + CHECK_EQ(written, u16size); + }); } - return EncodeTwoByteString( - isolate, u16size, [buf, buflen, u16size](uint16_t* dst) { - size_t written = simdutf::convert_valid_utf8_to_utf16( - buf, buflen, reinterpret_cast(dst)); - CHECK_EQ(written, u16size); - }); } val =