The remote debugging frame cache uses last_profiled_frame as an address anchor, so it can hit an ABA case when a frame address is reused after the old frame has returned. A profiler sample can then validate against the same address but append cached parent frames from a previous stack, producing impossible mixed stacks such as a b_leaf frame with an a_parent caller. PR #151437 fixes missing anchor updates, but the cache should also distinguish reused frame addresses, for example by pairing the anchor with a monotonic sequence, so cached continuations are only reused for the exact frame instance previously sampled.
I confirmed the following standalone reproducer. It repeatedly samples a process that alternates between two separate call chains. Seeing names from both chains in one sampled stack is impossible unless cached parent frames were spliced onto the wrong live frame.
import contextlib
import os
import socket
import subprocess
import sys
import tempfile
import textwrap
from _remote_debugging import PROCESS_VM_READV_SUPPORTED, RemoteUnwinder
# Exception types that are suppressed around each sampling attempt so a
# single failed read of the target does not abort the whole run.
TRANSIENT_ERRORS = (OSError, RuntimeError, UnicodeDecodeError)

# The reproducer only makes sense where the remote unwinder can read the
# target's memory with process_vm_readv; bail out early everywhere else.
if sys.platform != "linux" or not PROCESS_VM_READV_SUPPORTED:
    raise SystemExit("requires Linux with process_vm_readv support")
def find_free_port():
    """Return a TCP port number that was free on localhost a moment ago.

    Binds a temporary socket to port 0 so the OS picks an unused port,
    reads the chosen port back, and closes the socket again. The port is
    not reserved after the function returns, so another process could in
    principle claim it before the caller binds to it.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("localhost", 0))
        # getsockname() returns (host, port) for AF_INET sockets.
        return sock.getsockname()[1]
def wait_for(sock, expected):
    """Block until the bytes *expected* have been received on *sock*.

    Sets a 10 second timeout on *sock* and then reads in chunks of up to
    4096 bytes, accumulating everything received, until *expected* appears
    somewhere in the accumulated data. Returns None.

    Raises:
        RuntimeError: the peer closed the connection (recv returned no
            data) before *expected* was seen.
    """
    # The timeout applies to each individual recv() call, not to the
    # wait as a whole.
    sock.settimeout(10.0)
    data = b""
    while expected not in data:
        chunk = sock.recv(4096)
        if not chunk:
            # An empty read means the other side closed the connection.
            raise RuntimeError(f"target exited before {expected!r}")
        data += chunk
# Port the target process connects back to in order to signal readiness.
port = find_free_port()

# Source of the target process. It reports "ready" over the socket and then
# alternates forever between two disjoint call chains:
#   <module> -> a_parent -> a_leaf -> burn_a
#   <module> -> b_parent -> b_leaf -> burn_b
# so a sampled stack should never contain names from both chains.
target = f"""\
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(("localhost", {port}))
sock.sendall(b"ready")
def burn_a():
    total = 0
    for i in range(20000):
        total += i
    return total
def burn_b():
    total = 0
    for i in range(20000):
        total += i
    return total
def a_leaf():
    return burn_a()
def b_leaf():
    return burn_b()
def a_parent():
    return a_leaf()
def b_parent():
    return b_leaf()
while True:
    a_parent()
    b_parent()
"""
# Driver: write the target script to a temporary directory, start it, wait
# for it to report readiness, then sample its stacks repeatedly and stop at
# the first stack that mixes frames from both call chains.
with tempfile.TemporaryDirectory() as tmp:
    script = os.path.join(tmp, "target.py")
    with open(script, "w", encoding="utf-8") as f:
        f.write(textwrap.dedent(target))

    # Start listening before the target is launched so its connect() call
    # finds a listening socket.
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(("localhost", port))
    server.listen(1)
    server.settimeout(10.0)

    proc = subprocess.Popen([sys.executable, script])
    conn = None
    try:
        conn, _ = server.accept()
        wait_for(conn, b"ready")

        unwinder = RemoteUnwinder(
            proc.pid, all_threads=True, cache_frames=True, stats=True
        )
        branch_a = {"a_parent", "a_leaf", "burn_a"}
        branch_b = {"b_parent", "b_leaf", "burn_b"}

        for sample in range(1, 8001):
            # A failed sample is skipped rather than aborting the run.
            # SystemExit is not among the suppressed types, so the exit
            # below still propagates out of this block.
            with contextlib.suppress(*TRANSIENT_ERRORS):
                traces = unwinder.get_stack_trace()
                for interp in traces:
                    for thread in interp.threads:
                        funcs = [frame.funcname for frame in thread.frame_info]
                        names = set(funcs)
                        # Names from both chains in one stack means frames
                        # from two different call chains were combined.
                        if branch_a & names and branch_b & names:
                            print(f"mixed stack found at sample {sample}")
                            print(funcs)
                            print(unwinder.get_stats())
                            raise SystemExit(1)

        print("no mixed stack found")
        print(unwinder.get_stats())
    finally:
        # Always release the sockets and stop the target, which otherwise
        # loops forever.
        if conn is not None:
            conn.close()
        server.close()
        proc.kill()
        proc.wait(timeout=10.0)
On the unfixed build, this reproduced for me with this output:
mixed stack found at sample 5904
['b_leaf', 'a_parent', '<module>']
{'total_samples': 5904, 'frame_cache_hits': 5887, 'frame_cache_misses': 13, 'frame_cache_partial_hits': 3, 'frames_read_from_cache': 17666, 'frames_read_from_memory': 5942, 'memory_reads': 11889, 'memory_bytes_read': 5722200, 'code_object_cache_hits': 5947, 'code_object_cache_misses': 7, 'stale_cache_invalidations': 0, 'batched_read_attempts': 5903, 'batched_read_successes': 5903, 'batched_read_misses': 0, 'batched_read_segments_requested': 17709, 'batched_read_segments_completed': 17709, 'frame_cache_hit_rate': 99.7797729967813, 'code_object_cache_hit_rate': 99.88243197850184, 'batched_read_success_rate': 100.0, 'batched_read_segment_completion_rate': 100.0}
Linked PRs
The remote debugging frame cache uses `last_profiled_frame` as an address anchor, so it can hit an ABA case when a frame address is reused after the old frame has returned. A profiler sample can then validate against the same address but append cached parent frames from a previous stack, producing impossible mixed stacks such as a `b_leaf` frame with an `a_parent` caller. PR #151437 fixes missing anchor updates, but the cache should also distinguish reused frame addresses, for example by pairing the anchor with a monotonic sequence, so cached continuations are only reused for the exact frame instance previously sampled.
I confirmed the following standalone reproducer. It repeatedly samples a process that alternates between two separate call chains. Seeing names from both chains in one sampled stack is impossible unless cached parent frames were spliced onto the wrong live frame.
On the unfixed build, this reproduced for me with this output:
Linked PRs