core/state,triedb/pathdb: fix bintrieFlatReader disk-layer shape via per-offset extraction

Addresses review finding C2 (+ I5, S5, T2, T3, T12). Before this commit, bintrieFlatCodec.ReadAccount returned the FULL variable-length stem blob from disk while the in-memory diff-layer buffer stored per-offset 32-byte values. The consumer, bintrieFlatReader.Account, enforced len(basicBlob)!=32 → error, so every disk-layer hit produced "bintrie BasicData leaf invalid length" in production the moment the write buffer flushed. TestBintrieFlatReaderEndToEnd did not catch this because it never forced a buffer → disk flush. Fix: make bintrieFlatCodec.ReadAccount extract the offset from the stem blob (mirroring ReadStorage), so the disk path and the buffer path return the same 32-byte per-offset shape. Update AccountCacheKey/StorageCacheKey to embed the full 32-byte key (prefix + 31-byte stem + 1-byte offset), since caching under a stem-only key would collapse BasicData and CodeHash into the same slot and return the wrong value on the second hit. Update Flush's cache-update loop to store per-offset entries from the aggregated write set. Design note: I considered the alternative of introducing a new StemBlob(stem) interface method that returns the full blob synthesized from a stem-level lookup index. Rejected because (a) the index is a new data structure with its own consistency invariants, (b) the per-offset approach is strictly local to the codec + reader, and (c) the "1 Pebble read per Account" locality benefit is preserved at the OS page cache level — both offsets at the same stem live in the same Pebble block, so the second read is effectively free. bintrieFlatReader.Account still does two AccountRLP lookups; the torn-read hazard is gated by a new load-bearing invariant test, TestBinaryHasherWritesBothBasicAndCodeHash, which asserts that binaryHasher.updateAccount always emits both BasicData and CodeHash leaves together. A future code-only update that broke this invariant would fail the test. Tests added: * TestBintrieFlatReaderEndToEndAfterFlush — explicitly flushes via tdb.Commit(root, false) and re-reads through a fresh StateReader. This is the smoking-gun regression for C2. * TestBintrieFlatReaderMultipleOffsetsPerStem — multiple offsets at the same stem (BasicData, CodeHash, header storage slots) all round-trip post-flush. * TestBintrieCodecCrossFlushRMW — two Flush calls to the same stem from different "blocks" correctly merge on disk, with prior offsets preserved. * TestBinaryHasherWritesBothBasicAndCodeHash — locks down the hasher co-write invariant that bintrieFlatReader.Account relies on. Existing tests updated to match the new per-offset ReadAccount semantics: * TestBintrieCodecAccountRoundTrip, TestBintrieCodecMultipleWritesSameStem, TestBintrieCodecDeleteAccount — now read per-offset rather than calling extractStemOffset on the raw blob. * TestBintrieCodecCacheKeysDisjoint — additionally verifies two offsets at the same stem produce distinct cache keys. Error messages in bintrieFlatReader now include address and length context (S5).
2026-06-12 01:41:36 +00:00 · 2026-04-09 00:31:28 +02:00 · 2026-04-09 00:31:28 +02:00 · fcc0587ec3
commit fcc0587ec3
parent bfb77d98f6
5 changed files with 466 additions and 99 deletions
--- a/core/state/database_hasher_binary_test.go
+++ b/core/state/database_hasher_binary_test.go
@ -383,6 +383,89 @@ func TestMerkleHasherNoLeafProducer(t *testing.T) {
 	}
 }

+// TestBinaryHasherWritesBothBasicAndCodeHash is a load-bearing invariant
+// test for the A1 remediation. The bintrieFlatReader.Account method
+// performs TWO independent AccountRLP reads (BasicData at offset 0 and
+// CodeHash at offset 1). Cross-read consistency is only safe if the
+// hasher ALWAYS co-writes both leaves whenever it touches an account —
+// if a future optimization (e.g., a code-only update) emitted only the
+// CodeHash leaf, the two reads could resolve to different layers and
+// return a torn view.
+//
+// This test locks the invariant down: after an UpdateAccount call, the
+// drained stem writes must contain EXACTLY ONE BasicData write and
+// EXACTLY ONE CodeHash write for the touched address, both at the same
+// stem. Any change to binaryHasher.updateAccount that drops either
+// write will fail this test and the developer will be forced to
+// re-evaluate the bintrieFlatReader.Account torn-read argument before
+// shipping.
+func TestBinaryHasherWritesBothBasicAndCodeHash(t *testing.T) {
+	db := triedb.NewDatabase(rawdb.NewMemoryDatabase(), triedb.VerkleDefaults)
+	h := newTestBinaryHasher(t, db, types.EmptyBinaryHash, hasherTestConfig{"inv", false, false})
+
+	lp, ok := Hasher(h).(LeafProducer)
+	if !ok {
+		t.Fatal("binaryHasher should implement LeafProducer")
+	}
+
+	// Update a single account. The hasher MUST emit exactly two stem
+	// writes: BasicData (offset 0) and CodeHash (offset 1), at the
+	// same stem.
+	if err := h.UpdateAccount(
+		[]common.Address{hasherAddr1},
+		[]AccountMut{hasherAccount(1, 100)},
+	); err != nil {
+		t.Fatalf("UpdateAccount: %v", err)
+	}
+	writes := lp.DrainStemWrites()
+	if len(writes) != 2 {
+		t.Fatalf("expected exactly 2 stem writes per UpdateAccount (BasicData + CodeHash), got %d", len(writes))
+	}
+
+	// Verify one is BasicData and one is CodeHash.
+	seenBasic := false
+	seenCode := false
+	for _, w := range writes {
+		switch w.Offset {
+		case bintrie.BasicDataLeafKey:
+			seenBasic = true
+		case bintrie.CodeHashLeafKey:
+			seenCode = true
+		default:
+			t.Errorf("unexpected stem write offset %d (want %d or %d)", w.Offset, bintrie.BasicDataLeafKey, bintrie.CodeHashLeafKey)
+		}
+	}
+	if !seenBasic {
+		t.Error("UpdateAccount did NOT emit a BasicData leaf write — bintrieFlatReader.Account torn-read invariant broken")
+	}
+	if !seenCode {
+		t.Error("UpdateAccount did NOT emit a CodeHash leaf write — bintrieFlatReader.Account torn-read invariant broken")
+	}
+
+	// Verify both writes target the same stem.
+	if writes[0].Stem != writes[1].Stem {
+		t.Errorf("BasicData and CodeHash writes at different stems: %x vs %x", writes[0].Stem, writes[1].Stem)
+	}
+
+	// Exercise the delete path too: binaryHasher.deleteAccount should
+	// also emit both nil writes.
+	if err := h.UpdateAccount(
+		[]common.Address{hasherAddr1},
+		[]AccountMut{{Account: nil}},
+	); err != nil {
+		t.Fatalf("UpdateAccount (delete): %v", err)
+	}
+	deleteWrites := lp.DrainStemWrites()
+	if len(deleteWrites) != 2 {
+		t.Fatalf("expected 2 stem writes per account delete, got %d", len(deleteWrites))
+	}
+	for i, w := range deleteWrites {
+		if w.Value != nil {
+			t.Errorf("delete write[%d] should have nil Value, got %x", i, w.Value)
+		}
+	}
+}
+
 // TestStateUpdateEncodeBinaryFromLeaves verifies that stateUpdate.encodeBinary
 // turns a slice of StemWrite values into the per-offset accountData map that
 // pathdb's bintrie codec consumes. Three things matter:
--- a/core/state/reader.go
+++ b/core/state/reader.go
@ -18,6 +18,7 @@ package state

 import (
 	"errors"
+	"fmt"
 	"sync"
 	"sync/atomic"

@ -221,37 +222,51 @@ func newBintrieFlatReader(reader database.StateReader) *bintrieFlatReader {
 	return &bintrieFlatReader{reader: raw}
 }

-// Account implements StateReader. It performs two underlying reads — one
-// for the BasicData leaf (offset 0) and one for the CodeHash leaf
-// (offset 1) — and combines them into a unified Account. If both leaves
-// are absent the account is treated as non-existent (return nil, nil).
+// Account implements StateReader. It performs two underlying reads —
+// one for the BasicData leaf (offset 0) and one for the CodeHash leaf
+// (offset 1) — and combines them into a unified Account.
 //
-// Returning nil-with-no-error matches the merkle flatReader's
-// "not present" semantics: the trie reader is the gatekeeper that
-// distinguishes "missing" from "present-with-zero-balance".
+// Torn-read invariant (load-bearing): binaryHasher.updateAccount
+// ALWAYS co-writes BasicData and CodeHash in a single UpdateAccount
+// call (see core/state/database_hasher_binary.go:updateAccount). A
+// future change that introduced a code-only update without
+// re-emitting BasicData would break the implicit cross-read
+// consistency here. TestBinaryHasherWritesBothBasicAndCodeHash locks
+// this invariant down.
+//
+// Fall-through vs confirmed-absent (see A2):
+//   - both leaves genuinely absent from the flat state →
+//     errNotCoveredYet → multiStateReader falls through to trie reader.
+//   - both leaves present-but-stem-blob-has-zero-offsets (post-delete
+//     tombstone) → (nil, nil) → confirmed absent.
+//
+// At this commit (A1) we still return (nil, nil) for the absent case.
+// A2 tightens this to an errNotCoveredYet sentinel so the trie reader
+// runs on miss.
 func (r *bintrieFlatReader) Account(addr common.Address) (*Account, error) {
 	basicKey := common.BytesToHash(bintrie.GetBinaryTreeKeyBasicData(addr))
 	codeKey := common.BytesToHash(bintrie.GetBinaryTreeKeyCodeHash(addr))

 	basicBlob, err := r.reader.AccountRLP(basicKey)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("bintrie BasicData read %x: %w", addr, err)
 	}
 	codeBlob, err := r.reader.AccountRLP(codeKey)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("bintrie CodeHash read %x: %w", addr, err)
 	}
 	if len(basicBlob) == 0 && len(codeBlob) == 0 {
 		return nil, nil
 	}
-	// A bintrie leaf is always either absent or exactly 32 bytes; a
-	// shorter blob is a corruption signal we surface as an error rather
-	// than silently constructing a junk account.
+	// A bintrie leaf is always either absent or exactly 32 bytes. A
+	// shorter blob is a corruption signal; surface it with enough
+	// context (address + actual length) to make the on-call engineer's
+	// grep productive.
 	if len(basicBlob) != 0 && len(basicBlob) != 32 {
-		return nil, errors.New("bintrie BasicData leaf has invalid length")
+		return nil, fmt.Errorf("bintrie BasicData leaf invalid length: addr=%x len=%d want=32", addr, len(basicBlob))
 	}
 	if len(codeBlob) != 0 && len(codeBlob) != 32 {
-		return nil, errors.New("bintrie CodeHash leaf has invalid length")
+		return nil, fmt.Errorf("bintrie CodeHash leaf invalid length: addr=%x len=%d want=32", addr, len(codeBlob))
 	}

 	acct := &Account{}
@ -288,13 +303,13 @@ func (r *bintrieFlatReader) Storage(addr common.Address, slot common.Hash) (comm
 	fullKey := bintrie.GetBinaryTreeKeyStorageSlot(addr, slot[:])
 	blob, err := r.reader.AccountRLP(common.BytesToHash(fullKey))
 	if err != nil {
-		return common.Hash{}, err
+		return common.Hash{}, fmt.Errorf("bintrie storage read %x[%x]: %w", addr, slot, err)
 	}
 	if len(blob) == 0 {
 		return common.Hash{}, nil
 	}
 	if len(blob) != 32 {
-		return common.Hash{}, errors.New("bintrie storage leaf has invalid length")
+		return common.Hash{}, fmt.Errorf("bintrie storage leaf invalid length: addr=%x slot=%x len=%d want=32", addr, slot, len(blob))
 	}
 	var value common.Hash
 	copy(value[:], blob)
--- a/core/state/reader_bintrie_test.go
+++ b/core/state/reader_bintrie_test.go
@ -170,3 +170,163 @@ func TestBintrieFlatReaderMissingAccount(t *testing.T) {
 		t.Errorf("missing account: got %+v, want nil", got)
 	}
 }
+
+// TestBintrieFlatReaderEndToEndAfterFlush is the smoking-gun regression
+// test for A1 (fix bintrieFlatReader disk-layer shape). Before the A1
+// remediation, `bintrieFlatCodec.ReadAccount` returned the full stem
+// blob from disk while `bintrieFlatReader.Account` expected a per-offset
+// 32-byte value — so every disk-layer hit errored with "bintrie
+// BasicData leaf invalid length". The original TestBintrieFlatReaderEndToEnd
+// did not catch this because it never flushed the write buffer to disk:
+// all reads came from the in-memory diff-layer buffer (which stores
+// per-offset entries correctly).
+//
+// This test explicitly calls `tdb.Commit(root, false)` after the state
+// commit, forcing the buffer to flush. Subsequent reads MUST hit the
+// disk-layer code path. If A1 regresses, the reads either error out or
+// return wrong data.
+func TestBintrieFlatReaderEndToEndAfterFlush(t *testing.T) {
+	disk := rawdb.NewMemoryDatabase()
+	tdb := triedb.NewDatabase(disk, triedb.VerkleDefaults)
+	sdb := NewDatabase(tdb, nil)
+
+	state, err := New(types.EmptyVerkleHash, sdb)
+	if err != nil {
+		t.Fatalf("init state: %v", err)
+	}
+
+	var (
+		addrA   = common.HexToAddress("0xAAaaAAaaAAaaAAaaAAaaAAaaAAaaAAaaAAaaAAaa")
+		addrB   = common.HexToAddress("0xBBbbBBbbBBbbBBbbBBbbBBbbBBbbBBbbBBbbBBbb")
+		balance = uint256.NewInt(0xCAFE)
+		slot    = common.HexToHash("0x07")
+		value   = common.HexToHash("0x42")
+	)
+
+	state.SetBalance(addrA, balance, tracing.BalanceChangeUnspecified)
+	state.SetNonce(addrA, 5, tracing.NonceChangeUnspecified)
+	state.SetCode(addrA, []byte{0x60, 0x80, 0x60, 0x40}, tracing.CodeChangeUnspecified)
+	state.SetState(addrA, slot, value)
+	state.SetBalance(addrB, uint256.NewInt(0xBEEF), tracing.BalanceChangeUnspecified)
+
+	root, err := state.Commit(0, true, false)
+	if err != nil {
+		t.Fatalf("commit: %v", err)
+	}
+
+	// Force buffer → disk flush. Without this, all reads below would hit
+	// the in-memory diff-layer buffer path, masking the A1 bug.
+	if err := tdb.Commit(root, false); err != nil {
+		t.Fatalf("tdb.Commit (flush to disk): %v", err)
+	}
+
+	// Open a fresh StateReader for the flushed root. Reads now go
+	// through the disk layer via `codec.ReadAccount`, which (post-A1)
+	// must return per-offset 32-byte values matching what the reader
+	// expects.
+	reader, err := sdb.StateReader(root)
+	if err != nil {
+		t.Fatalf("StateReader after flush: %v", err)
+	}
+
+	gotA, err := reader.Account(addrA)
+	if err != nil {
+		t.Fatalf("Account A after flush: %v", err)
+	}
+	if gotA == nil {
+		t.Fatal("addrA: account is nil after flush (A1 regression)")
+	}
+	if gotA.Nonce != 5 {
+		t.Errorf("addrA nonce after flush: got %d, want 5", gotA.Nonce)
+	}
+	if gotA.Balance.Cmp(balance) != 0 {
+		t.Errorf("addrA balance after flush: got %s, want %s", gotA.Balance, balance)
+	}
+
+	gotB, err := reader.Account(addrB)
+	if err != nil {
+		t.Fatalf("Account B after flush: %v", err)
+	}
+	if gotB == nil {
+		t.Fatal("addrB: account is nil after flush (A1 regression)")
+	}
+	if gotB.Balance.Uint64() != 0xBEEF {
+		t.Errorf("addrB balance after flush: got %s, want 0xBEEF", gotB.Balance)
+	}
+
+	gotSlot, err := reader.Storage(addrA, slot)
+	if err != nil {
+		t.Fatalf("Storage after flush: %v", err)
+	}
+	if gotSlot != value {
+		t.Errorf("storage slot after flush: got %x, want %x", gotSlot, value)
+	}
+}
+
+// TestBintrieFlatReaderMultipleOffsetsPerStem verifies that multiple
+// offsets at the same stem (BasicData at offset 0, CodeHash at offset 1,
+// a header storage slot at offset 64+slotnum) all round-trip correctly
+// through the per-offset read path. This exercises the "same stem, many
+// offsets" common case for contract accounts with header storage.
+func TestBintrieFlatReaderMultipleOffsetsPerStem(t *testing.T) {
+	disk := rawdb.NewMemoryDatabase()
+	tdb := triedb.NewDatabase(disk, triedb.VerkleDefaults)
+	sdb := NewDatabase(tdb, nil)
+
+	state, err := New(types.EmptyVerkleHash, sdb)
+	if err != nil {
+		t.Fatalf("init state: %v", err)
+	}
+
+	addr := common.HexToAddress("0x1234567890abcdef1234567890abcdef12345678")
+	state.SetBalance(addr, uint256.NewInt(100), tracing.BalanceChangeUnspecified)
+	state.SetNonce(addr, 7, tracing.NonceChangeUnspecified)
+	state.SetCode(addr, []byte{0xDE, 0xAD, 0xBE, 0xEF}, tracing.CodeChangeUnspecified)
+	// Header slots 0..63 (per EIP-7864) live at the same stem as
+	// BasicData/CodeHash. Set a few to exercise multi-offset per stem.
+	state.SetState(addr, common.HexToHash("0x00"), common.HexToHash("0x11"))
+	state.SetState(addr, common.HexToHash("0x01"), common.HexToHash("0x22"))
+	state.SetState(addr, common.HexToHash("0x05"), common.HexToHash("0x33"))
+
+	root, err := state.Commit(0, true, false)
+	if err != nil {
+		t.Fatalf("commit: %v", err)
+	}
+	// Flush so the reads hit the disk path.
+	if err := tdb.Commit(root, false); err != nil {
+		t.Fatalf("tdb.Commit: %v", err)
+	}
+
+	reader, err := sdb.StateReader(root)
+	if err != nil {
+		t.Fatalf("StateReader: %v", err)
+	}
+
+	gotAcct, err := reader.Account(addr)
+	if err != nil {
+		t.Fatalf("Account: %v", err)
+	}
+	if gotAcct == nil {
+		t.Fatal("account is nil")
+	}
+	if gotAcct.Nonce != 7 {
+		t.Errorf("nonce: got %d, want 7", gotAcct.Nonce)
+	}
+	if gotAcct.Balance.Uint64() != 100 {
+		t.Errorf("balance: got %s, want 100", gotAcct.Balance)
+	}
+
+	for _, tc := range []struct{ slot, want common.Hash }{
+		{common.HexToHash("0x00"), common.HexToHash("0x11")},
+		{common.HexToHash("0x01"), common.HexToHash("0x22")},
+		{common.HexToHash("0x05"), common.HexToHash("0x33")},
+	} {
+		got, err := reader.Storage(addr, tc.slot)
+		if err != nil {
+			t.Fatalf("Storage(%x): %v", tc.slot, err)
+		}
+		if got != tc.want {
+			t.Errorf("slot %x: got %x, want %x", tc.slot, got, tc.want)
+		}
+	}
+}
--- a/triedb/pathdb/flat_codec_bintrie.go
+++ b/triedb/pathdb/flat_codec_bintrie.go
@ -126,18 +126,36 @@ func (c *bintrieFlatCodec) StorageKey(addr common.Address, slot common.Hash) (co
 // Disk reads
 // ---------------------------------------------------------------------

-// ReadAccount returns the raw stem blob for the account's stem — NOT a
-// decoded account. The caller (e.g. bintrieFlatReader in a later commit)
-// is responsible for extracting BasicData (offset 0) and CodeHash
-// (offset 1) from the blob.
+// ReadAccount returns the 32-byte value stored at the offset indicated
+// by the input key (the final byte of `key` is the bintrie offset).
+// Returns nil if the offset is not populated in the on-disk stem blob.
 //
-// This signature asymmetry with merkleFlatCodec.ReadAccount (which
-// returns slim-RLP-encoded account bytes) is intentional: a bintrie stem
-// blob can contain data for many logical fields, and the caller decides
-// which offsets to extract. A higher-level "return an assembled Account"
-// helper would have to re-encode into a format no consumer wants.
+// The per-offset return shape matches ReadStorage and, crucially,
+// matches the buffer-path return shape: the pathdb diff-layer buffer
+// stores per-offset entries (keyed by the full 32-byte stem||offset
+// key) holding 32-byte leaf values. When `disklayer.account()` falls
+// through from the buffer to the codec's disk read, both sides must
+// agree on the per-offset representation — otherwise a length check in
+// the consumer (bintrieFlatReader.Account) fails on every
+// post-buffer-flush read. Prior to this commit the disk path returned
+// the whole stem blob while the buffer path returned a 32-byte value,
+// which caused every real-world read to error once the buffer spilled
+// to disk.
+//
+// A malformed stem blob is treated as "entry absent" (returning nil)
+// to match the behavior of rawdb.ReadStorageSnapshot on the merkle
+// path — the interface has no error channel, and propagating nil lets
+// the multi-reader fall through to the trie reader as a gatekeeper.
 func (c *bintrieFlatCodec) ReadAccount(db ethdb.KeyValueReader, key common.Hash) []byte {
-	return rawdb.ReadBinTrieStem(db, stemFromKey(key))
+	blob := rawdb.ReadBinTrieStem(db, stemFromKey(key))
+	if len(blob) == 0 {
+		return nil
+	}
+	val, err := extractStemOffset(blob, offsetFromKey(key))
+	if err != nil {
+		return nil
+	}
+	return val
 }

 // ReadStorage returns the 32-byte value stored at the storage slot's
@ -285,27 +303,37 @@ func splitAccountBlob(blob []byte) ([]stemOffsetValue, error) {

 // AccountCacheKey returns a disambiguated byte key for the shared
 // fastcache-backed clean state cache. The prefix byte
-// bintrieCacheKeyPrefix keeps bintrie stem lookups disjoint from merkle
-// account lookups (both of which use 32-byte keys), and from merkle
-// storage lookups (which use 64-byte keys). The stem (31 bytes) is
-// embedded after the prefix; the offset byte is not included because
-// the cache entry caches the whole stem blob, not a single offset.
+// bintrieCacheKeyPrefix keeps bintrie lookups disjoint from merkle
+// account lookups (32-byte keys) and from merkle storage lookups
+// (64-byte keys).
+//
+// The full 32-byte (stem || offset) key is embedded after the prefix
+// so each offset at a given stem gets its own cache entry. This is
+// required because ReadAccount now returns the per-offset 32-byte
+// leaf value rather than the whole stem blob: caching under a
+// stem-only key would collapse BasicData and CodeHash (or any two
+// offsets at the same stem) into a single slot and the second hit
+// would return the wrong offset's value.
+//
+// Resulting layout: 1 byte prefix + 32 bytes full key = 33 bytes
+// total. The stem is at bytes[1..31]; the offset is at byte[32].
 func (c *bintrieFlatCodec) AccountCacheKey(key common.Hash) []byte {
-	out := make([]byte, 1+bintrie.StemSize)
+	out := make([]byte, 1+common.HashLength)
 	out[0] = bintrieCacheKeyPrefix
-	copy(out[1:], stemFromKey(key))
+	copy(out[1:], key[:])
 	return out
 }

-// StorageCacheKey returns the cache key for a storage entry. For bintrie
-// this is the same stem as the account cache key — storage slots and
-// account header live at different stems in the general case, but
-// multiple storage slots of the same stem share a single cache entry.
-// The accountKey parameter is ignored (see StorageKey).
+// StorageCacheKey returns the cache key for a storage entry. The
+// accountKey parameter is ignored (see StorageKey). The full storage
+// key — which already encodes (stem || offset) via
+// GetBinaryTreeKeyStorageSlot — is embedded directly so each slot at
+// a stem has its own cache entry, matching the per-offset semantics
+// of AccountCacheKey.
 func (c *bintrieFlatCodec) StorageCacheKey(_ common.Hash, storageKey common.Hash) []byte {
-	out := make([]byte, 1+bintrie.StemSize)
+	out := make([]byte, 1+common.HashLength)
 	out[0] = bintrieCacheKeyPrefix
-	copy(out[1:], stemFromKey(storageKey))
+	copy(out[1:], storageKey[:])
 	return out
 }

@ -387,29 +415,34 @@ func (c *bintrieFlatCodec) MarkerCompare(key []byte, marker []byte) int {
 // produced by AccountKey/StorageKey, and each value is a 32-byte leaf
 // (or nil to clear that offset).
 //
-// All entries are first grouped by stem, then a single
-// read-modify-write is issued per stem so the codec touches each stem
-// at most once during a flush. This is what allows the per-call
-// pre-aggregation requirement documented on bintrieFlatCodec to be
-// satisfied even when many writes target the same stem.
+// Writes are aggregated per stem and a single read-modify-write is
+// issued per stem, so the codec touches each stem at most once during
+// a flush and the per-call pre-aggregation requirement is satisfied
+// even when many writes target the same stem.
 //
-// storageData is also walked because higher-level callers may emit
-// storage entries that the codec routes through the storage map for
-// historical reasons; for the bintrie path, entries should normally
-// arrive on accountData but we accept either layout.
+// storageData is walked alongside accountData; bintrie entries should
+// normally arrive on accountData but we accept either layout for
+// robustness.
+//
+// Cache update: after the per-stem RMW, the clean cache is updated
+// with each written offset's new value (per-offset entries, matching
+// the shape returned by ReadAccount after the A1 remediation). Offsets
+// that were not touched by this flush retain their existing cache
+// entries, which remain valid because the RMW did not modify them.
 //
 // Returns (offset count from accountData, offset count from storageData)
-// so the metric reporting in writeStates remains comparable to the
-// merkle path. The clean cache is updated with the merged stem blob
-// (one cache entry per stem, not per offset) — readers extract the
-// requested offset on hit.
+// for metric reporting parity with the merkle path.
 func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int) {
-	// Aggregate per-offset writes into per-stem batches. We use [31]byte
-	// as the map key because bytes slices aren't hashable in Go and the
-	// stem itself is fixed size; the alternative (using common.Hash with
-	// a zero pad) would waste a byte per entry.
+	// Aggregate per-offset writes into per-stem batches. We use
+	// [31]byte as the map key because byte slices aren't hashable in
+	// Go and the stem is fixed size; the alternative (common.Hash with
+	// a zero pad) wastes a byte per entry without buying anything.
 	type aggregator struct {
-		writes []stemOffsetValue
+		// fullKeys preserves the original 32-byte lookup keys so the
+		// cache update loop below can store per-offset entries without
+		// reconstructing the key from (stem, offset) pairs.
+		fullKeys []common.Hash
+		writes   []stemOffsetValue
 	}
 	aggregated := make(map[[bintrie.StemSize]byte]*aggregator)

@ -422,6 +455,7 @@ func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountDat
 			ag = &aggregator{}
 			aggregated[stem] = ag
 		}
+		ag.fullKeys = append(ag.fullKeys, fullKey)
 		ag.writes = append(ag.writes, stemOffsetValue{Offset: offset, Value: value})
 	}

@ -448,22 +482,19 @@ func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountDat
 			addWrite(fullKey, value)
 		}
 	}
-	// Issue one RMW per stem and update the clean cache with the merged
-	// blob (or invalidate it if the stem was deleted).
-	for stem, ag := range aggregated {
-		merged := c.applyWrites(batch, stem[:], ag.writes)
-		if clean != nil {
-			// Reuse AccountCacheKey to derive the cache key — for
-			// bintrie this only depends on the stem so the trailing
-			// offset byte in the synthetic full key is irrelevant.
-			var fullKey common.Hash
-			copy(fullKey[:bintrie.StemSize], stem[:])
+	// Issue one RMW per stem, then update the clean cache per-offset
+	// using the fullKeys we captured in the aggregator. A nil/empty
+	// value stored in the cache means "confirmed absent" (the reader
+	// will fall through to the trie reader on this per feedback #3 /
+	// Commit A2); a 32-byte value means the offset is populated.
+	for _, ag := range aggregated {
+		c.applyWrites(batch, ag.fullKeys[0][:bintrie.StemSize], ag.writes)
+		if clean == nil {
+			continue
+		}
+		for i, fullKey := range ag.fullKeys {
 			cacheKey := c.AccountCacheKey(fullKey)
-			if merged == nil {
-				clean.Set(cacheKey, nil)
-			} else {
-				clean.Set(cacheKey, merged)
-			}
+			clean.Set(cacheKey, ag.writes[i].Value)
 		}
 	}
 	return accountWrites, storageWrites
--- a/triedb/pathdb/flat_codec_bintrie_test.go
+++ b/triedb/pathdb/flat_codec_bintrie_test.go
@ -48,8 +48,9 @@ func flushBatch(t *testing.T, batch interface{ Write() error }) {

 // TestBintrieCodecAccountRoundTrip verifies that an account written via
 // WriteAccount (a two-slot BasicData||CodeHash blob) is persisted under
-// the account's stem and can be read back by extracting the relevant
-// offsets from the stem blob.
+// the account's stem and can be read back by calling ReadAccount with
+// the appropriate per-offset key (A1 remediation: ReadAccount now
+// returns a per-offset 32-byte value, matching the buffer-path shape).
 func TestBintrieCodecAccountRoundTrip(t *testing.T) {
 	codec, db := newTestBintrieCodec(t)
 	addr := common.HexToAddress("0x1111111111111111111111111111111111111111")
@ -62,19 +63,19 @@ func TestBintrieCodecAccountRoundTrip(t *testing.T) {
 	codec.WriteAccount(batch, codec.AccountKey(addr), blob)
 	flushBatch(t, batch)

-	// Read back via ReadAccount — returns the raw stem blob, not the
-	// decoded account. Extract offsets 0 and 1 manually.
-	got := codec.ReadAccount(db, codec.AccountKey(addr))
-	if len(got) == 0 {
-		t.Fatal("ReadAccount returned empty for just-written account")
+	// Read each offset individually. `codec.AccountKey(addr)` returns
+	// the BasicData key (offset 0); the CodeHash key has the same stem
+	// with offset 1.
+	basicKey := codec.AccountKey(addr)
+	codeKey := common.BytesToHash(bintrie.GetBinaryTreeKeyCodeHash(addr))
+
+	gotBasic := codec.ReadAccount(db, basicKey)
+	if !bytes.Equal(gotBasic, basicData) {
+		t.Fatalf("BasicData read: got %x, want %x", gotBasic, basicData)
 	}
-	gotBasic, err := extractStemOffset(got, bintrie.BasicDataLeafKey)
-	if err != nil || !bytes.Equal(gotBasic, basicData) {
-		t.Fatalf("BasicData extract: got %x err=%v, want %x", gotBasic, err, basicData)
-	}
-	gotCode, err := extractStemOffset(got, bintrie.CodeHashLeafKey)
-	if err != nil || !bytes.Equal(gotCode, codeHash) {
-		t.Fatalf("CodeHash extract: got %x err=%v, want %x", gotCode, err, codeHash)
+	gotCode := codec.ReadAccount(db, codeKey)
+	if !bytes.Equal(gotCode, codeHash) {
+		t.Fatalf("CodeHash read: got %x, want %x", gotCode, codeHash)
 	}
 }

@ -129,14 +130,14 @@ func TestBintrieCodecMultipleWritesSameStem(t *testing.T) {
 	codec.WriteStorage(batch, acctKey, storageKey, storageValue)
 	flushBatch(t, batch)

-	// All three offsets should now be readable.
-	accountBlob := codec.ReadAccount(db, codec.AccountKey(addr))
-	gotBasic, _ := extractStemOffset(accountBlob, bintrie.BasicDataLeafKey)
-	if !bytes.Equal(gotBasic, basicData) {
+	// All three offsets should now be readable via per-offset reads.
+	basicKey := codec.AccountKey(addr)
+	codeKey := common.BytesToHash(bintrie.GetBinaryTreeKeyCodeHash(addr))
+
+	if gotBasic := codec.ReadAccount(db, basicKey); !bytes.Equal(gotBasic, basicData) {
 		t.Fatalf("BasicData lost after storage write: got %x, want %x", gotBasic, basicData)
 	}
-	gotCode, _ := extractStemOffset(accountBlob, bintrie.CodeHashLeafKey)
-	if !bytes.Equal(gotCode, codeHash) {
+	if gotCode := codec.ReadAccount(db, codeKey); !bytes.Equal(gotCode, codeHash) {
 		t.Fatalf("CodeHash lost after storage write: got %x, want %x", gotCode, codeHash)
 	}
 	gotStorage := codec.ReadStorage(db, acctKey, storageKey)
@ -173,14 +174,20 @@ func TestBintrieCodecDeleteAccount(t *testing.T) {
 	codec.DeleteAccount(batch, codec.AccountKey(addr))
 	flushBatch(t, batch)

-	accountBlob := codec.ReadAccount(db, codec.AccountKey(addr))
-	if len(accountBlob) == 0 {
+	basicKey := codec.AccountKey(addr)
+	codeKey := common.BytesToHash(bintrie.GetBinaryTreeKeyCodeHash(addr))
+
+	// Verify the underlying stem blob still exists (the storage slot
+	// at offset 64 should have prevented a full delete).
+	stemBlob := rawdb.ReadBinTrieStem(db, stemFromKey(basicKey))
+	if len(stemBlob) == 0 {
 		t.Fatal("stem blob was fully deleted; header storage should still be present")
 	}
-	if got, _ := extractStemOffset(accountBlob, bintrie.BasicDataLeafKey); got != nil {
+	// BasicData and CodeHash now read back as nil (offset cleared).
+	if got := codec.ReadAccount(db, basicKey); got != nil {
 		t.Fatalf("BasicData not cleared: %x", got)
 	}
-	if got, _ := extractStemOffset(accountBlob, bintrie.CodeHashLeafKey); got != nil {
+	if got := codec.ReadAccount(db, codeKey); got != nil {
 		t.Fatalf("CodeHash not cleared: %x", got)
 	}
 	if got := codec.ReadStorage(db, acctKey, storageKey); !bytes.Equal(got, storageValue) {
@ -224,8 +231,11 @@ func TestBintrieCodecDeleteLastOffsetRemovesKey(t *testing.T) {
 }

 // TestBintrieCodecCacheKeysDisjoint verifies that the bintrie cache key
-// prefix keeps it disjoint from merkle account keys. This is the
-// collision check that Agent 2 flagged in the review.
+// prefix keeps it disjoint from merkle account keys AND that two
+// different offsets at the same stem produce DIFFERENT cache keys
+// (the A1 remediation moved from per-stem caching to per-offset
+// caching — without the full-key embedding, BasicData and CodeHash
+// would collide in the cache and return wrong values).
 func TestBintrieCodecCacheKeysDisjoint(t *testing.T) {
 	codec := &bintrieFlatCodec{}
 	merkle := &merkleFlatCodec{}
@ -243,6 +253,20 @@ func TestBintrieCodecCacheKeysDisjoint(t *testing.T) {
 	if binKey[0] != bintrieCacheKeyPrefix {
 		t.Fatalf("bintrie cache key missing prefix byte: %x", binKey)
 	}
+	// Per-offset disambiguation: two keys with the same stem but
+	// different offsets must produce distinct cache keys.
+	var basicKey common.Hash
+	copy(basicKey[:], hash[:])
+	basicKey[31] = bintrie.BasicDataLeafKey
+	var codeKey common.Hash
+	copy(codeKey[:], hash[:])
+	codeKey[31] = bintrie.CodeHashLeafKey
+
+	basicCacheKey := codec.AccountCacheKey(basicKey)
+	codeCacheKey := codec.AccountCacheKey(codeKey)
+	if bytes.Equal(basicCacheKey, codeCacheKey) {
+		t.Fatalf("per-offset cache keys collided at same stem: %x", basicCacheKey)
+	}
 }

 // TestBintrieCodecSplitMarker verifies the single-tier marker handling.
@ -341,6 +365,60 @@ func TestBintrieCodecFlushAggregates(t *testing.T) {
 	}
 }

+// TestBintrieCodecCrossFlushRMW verifies that writes to the SAME stem
+// from DIFFERENT flush passes (simulating blocks N and N+1) correctly
+// merge on disk. Flush1 writes offsets 0+1 at stemX; Flush2 writes
+// offset 64 at the same stem. After both flushes, all three offsets
+// must be readable — the second flush must not clobber the first.
+//
+// This is the regression test for cross-flush RMW correctness and is
+// the bread-and-butter behavior of the per-stem codec layout. Before
+// the A1 remediation, the buffer → disk shape mismatch also masked
+// this (different writes would be invisible through the reader), so
+// the regression test had no teeth.
+func TestBintrieCodecCrossFlushRMW(t *testing.T) {
+	codec, db := newTestBintrieCodec(t)
+
+	stem := bytes.Repeat([]byte{0x99}, bintrie.StemSize)
+	mkKey := func(offset byte) common.Hash {
+		var k common.Hash
+		copy(k[:bintrie.StemSize], stem)
+		k[bintrie.StemSize] = offset
+		return k
+	}
+	basicVal := bytes.Repeat([]byte{0xAA}, stemBlobValueSize)
+	codeVal := bytes.Repeat([]byte{0xBB}, stemBlobValueSize)
+	slotVal := bytes.Repeat([]byte{0xCC}, stemBlobValueSize)
+
+	// Flush 1: write BasicData (offset 0) and CodeHash (offset 1).
+	batch := db.NewBatch()
+	codec.Flush(batch, nil, map[common.Hash][]byte{
+		mkKey(bintrie.BasicDataLeafKey): basicVal,
+		mkKey(bintrie.CodeHashLeafKey):  codeVal,
+	}, nil, nil)
+	flushBatch(t, batch)
+
+	// Flush 2: write a header storage slot at offset 64 — same stem.
+	batch = db.NewBatch()
+	codec.Flush(batch, nil, map[common.Hash][]byte{
+		mkKey(64): slotVal,
+	}, nil, nil)
+	flushBatch(t, batch)
+
+	// After both flushes, all three offsets must be readable. Before
+	// the RMW, Flush 2 would overwrite the stem blob and erase
+	// BasicData + CodeHash.
+	if got := codec.ReadAccount(db, mkKey(bintrie.BasicDataLeafKey)); !bytes.Equal(got, basicVal) {
+		t.Errorf("BasicData lost after second flush: got %x, want %x", got, basicVal)
+	}
+	if got := codec.ReadAccount(db, mkKey(bintrie.CodeHashLeafKey)); !bytes.Equal(got, codeVal) {
+		t.Errorf("CodeHash lost after second flush: got %x, want %x", got, codeVal)
+	}
+	if got := codec.ReadAccount(db, mkKey(64)); !bytes.Equal(got, slotVal) {
+		t.Errorf("header slot at offset 64 missing: got %x, want %x", got, slotVal)
+	}
+}
+
 // TestBintrieCodecFlushDelete verifies that nil-valued entries in the
 // accountData map clear the corresponding offset, and that clearing every
 // populated offset at a stem removes the on-disk key entirely (matching