From 1bf03e1f594b76132543b8ceb6652cb6b069e7dc Mon Sep 17 00:00:00 2001
From: Masataka Pocke Kuwabara
Date: Mon, 22 Jun 2026 16:28:00 +0900
Subject: [PATCH] Preserve UTF-8 encoding when reallocating a frozen ResumableParser buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`JSON::ResumableParser#<<` raises `Encoding::CompatibilityError` when the
first chunk is a frozen multibyte UTF-8 string and more data is fed after a
partial parse.

## Reproduction

```ruby
require "json"

parser = JSON::ResumableParser.new
parser << '{"message":"日本'.freeze  # frozen + multibyte (UTF-8)
parser.parse  # => false (incomplete)
parser << '語のつづき"}'.freeze  # => Encoding::CompatibilityError: BINARY and UTF-8
```

The error only surfaces when the first chunk is both frozen and multibyte.
A frozen ASCII-only first chunk stays ASCII-compatible and a mutable first
chunk keeps its encoding, so neither triggers it. This is common in
streaming use cases where chunks arrive as frozen UTF-8 (e.g.
gRPC/protobuf string fields), and is easily hit with multibyte (e.g.
Japanese) payloads.

## Root cause

In `cResumableParser_feed`, the first feed adopts a frozen input string
directly as the buffer (`parser->buffer = str`), keeping its UTF-8
encoding. When a later feed needs to reallocate that frozen buffer, it
allocates the new buffer with `rb_str_buf_new()`, which returns an
ASCII-8BIT (BINARY) string and does not carry over the original UTF-8
encoding. The subsequent `rb_str_append(parser->buffer, str)` then appends
a multibyte UTF-8 chunk to a BINARY buffer that already holds non-ASCII
bytes, and the two encodings are incompatible.

`convert_encoding` always normalizes the input to UTF-8, so the buffer is
guaranteed to be UTF-8. Associating the freshly allocated buffer with UTF-8
restores that invariant.
This mirrors the existing idiom used for the generator buffer, and also
fixes the latent case where a frozen ASCII-only first chunk would silently
leave the buffer as BINARY.

## Test

Added a regression test feeding frozen multibyte chunks across a partial
parse. The byte-by-byte `assert_resumed_parsing` helper cannot cover this:
it feeds single ASCII-8BIT bytes via `byte.chr`, which `convert_encoding`
turns into a mutable dup, so the buffer never becomes frozen and the
reallocation path is never taken.

The `.freeze` calls are kept explicit so the test still exercises the
frozen path even if the file's `frozen_string_literal` magic comment is
ever removed.

Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_01PpeQSEFkF1X1Uuzjx9BsYe
---
 ext/json/ext/parser/parser.c | 1 +
 test/json/resumable_parser_test.rb | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index c774c791..9a86cd58 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -2388,6 +2388,7 @@ static VALUE cResumableParser_feed(VALUE self, VALUE str) if (RB_OBJ_FROZEN_RAW(parser->buffer)) { VALUE new_buffer = rb_obj_hide(rb_str_buf_new(remaining + RSTRING_LEN(str))); + rb_enc_associate_index(new_buffer, utf8_encindex); char *old_ptr = RSTRING_PTR(parser->buffer); memcpy(RSTRING_PTR(new_buffer), old_ptr + consumed, remaining); diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb index e93b2314..4cc6c085 100644 --- a/test/json/resumable_parser_test.rb +++ b/test/json/resumable_parser_test.rb @@ -205,6 +205,16 @@ def test_rest assert_equal '"unterminated string', @parser.rest end + def test_feed_frozen_multibyte_chunks + @parser << '{"message":"日本'.freeze + refute @parser.parse + @parser << '語のつづき"}'.freeze + assert @parser.parse + value = @parser.value + assert_equal({ "message" => "日本語のつづき" }, value) + assert_equal Encoding::UTF_8, 
value["message"].encoding + end + def test_eos assert_predicate @parser, :eos?