Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 118 additions & 82 deletions sentry_sdk/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,74 +89,73 @@ def __exit__(
self._ids.pop(id(self._objs.pop()), None)


def serialize(event: "Dict[str, Any]", **kwargs: "Any") -> "Dict[str, Any]":
"""
A very smart serializer that takes a dict and emits a json-friendly dict.
Currently used for serializing the final Event and also prematurely while fetching the stack
local variables for each frame in a stacktrace.

It works internally with 'databags' which are arbitrary data structures like Mapping, Sequence and Set.
The algorithm itself is a recursive graph walk down the data structures it encounters.

It has the following responsibilities:
* Trimming databags and keeping them within MAX_DATABAG_BREADTH and MAX_DATABAG_DEPTH.
* Calling safe_repr() on objects appropriately to keep them informative and readable in the final payload.
* Annotating the payload with the _meta field whenever trimming happens.

:param max_request_body_size: If set to "always", will never trim request bodies.
:param max_value_length: The max length to strip strings to, or None to disable string truncation. Defaults to None.
:param is_vars: If we're serializing vars early, we want to repr() things that are JSON-serializable to make their type more apparent. For example, it's useful to see the difference between a unicode-string and a bytestring when viewing a stacktrace.
:param custom_repr: A custom repr function that runs before safe_repr on the object to be serialized. If it returns None or throws internally, we will fallback to safe_repr.

"""
memo = Memo()
path: "List[Segment]" = []
meta_stack: "List[Dict[str, Any]]" = []

keep_request_bodies: bool = kwargs.pop("max_request_body_size", None) == "always"
max_value_length: "Optional[int]" = kwargs.pop("max_value_length", None)
is_vars = kwargs.pop("is_vars", False)
custom_repr: "Callable[..., Optional[str]]" = kwargs.pop("custom_repr", None)

def _safe_repr_wrapper(value: "Any") -> str:
class _Serializer:
"""Holds the state of a single serialize() call."""

__slots__ = (
"memo",
"path",
"meta_stack",
"keep_request_bodies",
"max_value_length",
"is_vars",
"custom_repr",
)

def __init__(
self,
keep_request_bodies: bool,
max_value_length: "Optional[int]",
is_vars: bool,
custom_repr: "Optional[Callable[..., Optional[str]]]",
) -> None:
self.memo = Memo()
self.path: "List[Segment]" = []
self.meta_stack: "List[Dict[str, Any]]" = []
self.keep_request_bodies = keep_request_bodies
self.max_value_length = max_value_length
self.is_vars = is_vars
self.custom_repr = custom_repr

def _safe_repr_wrapper(self, value: "Any") -> str:
try:
repr_value = None
if custom_repr is not None:
repr_value = custom_repr(value)
if self.custom_repr is not None:
repr_value = self.custom_repr(value)
return repr_value or safe_repr(value)
except Exception:
return safe_repr(value)

def _annotate(**meta: "Any") -> None:
while len(meta_stack) <= len(path):
def _annotate(self, **meta: "Any") -> None:
while len(self.meta_stack) <= len(self.path):
try:
segment = path[len(meta_stack) - 1]
node = meta_stack[-1].setdefault(str(segment), {})
segment = self.path[len(self.meta_stack) - 1]
node = self.meta_stack[-1].setdefault(str(segment), {})
except IndexError:
node = {}

meta_stack.append(node)
self.meta_stack.append(node)

meta_stack[-1].setdefault("", {}).update(meta)
self.meta_stack[-1].setdefault("", {}).update(meta)

def _is_databag() -> "Optional[bool]":
def _is_databag(self) -> "Optional[bool]":
"""
A databag is any value that we need to trim.
True for stuff like vars, request bodies, breadcrumbs and extra.

:returns: `True` for "yes", `False` for :"no", `None` for "maybe soon".
"""
try:
if is_vars:
if self.is_vars:
return True

is_request_body = _is_request_body()
is_request_body = self._is_request_body()
if is_request_body in (True, None):
return is_request_body

p0 = path[0]
if p0 == "breadcrumbs" and path[1] == "values":
path[2]
p0 = self.path[0]
if p0 == "breadcrumbs" and self.path[1] == "values":
self.path[2]
return True

if p0 == "extra":
Expand All @@ -167,25 +166,26 @@ def _is_databag() -> "Optional[bool]":

return False

def _is_span_attribute() -> "Optional[bool]":
def _is_span_attribute(self) -> "Optional[bool]":
try:
if path[0] == "spans" and path[2] == "data":
if self.path[0] == "spans" and self.path[2] == "data":
return True
except IndexError:
return None

return False

def _is_request_body() -> "Optional[bool]":
def _is_request_body(self) -> "Optional[bool]":
try:
if path[0] == "request" and path[1] == "data":
if self.path[0] == "request" and self.path[1] == "data":
return True
except IndexError:
return None

return False

def _serialize_node(
self,
obj: "Any",
is_databag: "Optional[bool]" = None,
is_request_body: "Optional[bool]" = None,
Expand All @@ -195,14 +195,14 @@ def _serialize_node(
remaining_depth: "Optional[Union[int, float]]" = None,
) -> "Any":
if segment is not None:
path.append(segment)
self.path.append(segment)

try:
with memo.memoize(obj) as result:
with self.memo.memoize(obj) as result:
if result:
return CYCLE_MARKER

return _serialize_node_impl(
return self._serialize_node_impl(
obj,
is_databag=is_databag,
is_request_body=is_request_body,
Expand All @@ -219,16 +219,17 @@ def _serialize_node(
return None
finally:
if segment is not None:
path.pop()
del meta_stack[len(path) + 1 :]
self.path.pop()
del self.meta_stack[len(self.path) + 1 :]

def _flatten_annotated(obj: "Any") -> "Any":
def _flatten_annotated(self, obj: "Any") -> "Any":
if isinstance(obj, AnnotatedValue):
_annotate(**obj.metadata)
self._annotate(**obj.metadata)
obj = obj.value
return obj

def _serialize_node_impl(
self,
obj: "Any",
is_databag: "Optional[bool]",
is_request_body: "Optional[bool]",
Expand All @@ -239,16 +240,16 @@ def _serialize_node_impl(
if isinstance(obj, AnnotatedValue):
should_repr_strings = False
if should_repr_strings is None:
should_repr_strings = is_vars
should_repr_strings = self.is_vars

if is_databag is None:
is_databag = _is_databag()
is_databag = self._is_databag()

if is_request_body is None:
is_request_body = _is_request_body()
is_request_body = self._is_request_body()

if is_databag:
if is_request_body and keep_request_bodies:
if is_request_body and self.keep_request_bodies:
remaining_depth = float("inf")
remaining_breadth = float("inf")
else:
Expand All @@ -257,31 +258,33 @@ def _serialize_node_impl(
if remaining_breadth is None:
remaining_breadth = MAX_DATABAG_BREADTH

obj = _flatten_annotated(obj)
obj = self._flatten_annotated(obj)

if remaining_depth is not None and remaining_depth <= 0:
_annotate(rem=[["!limit", "x"]])
self._annotate(rem=[["!limit", "x"]])
if is_databag:
return _flatten_annotated(
strip_string(_safe_repr_wrapper(obj), max_length=max_value_length)
return self._flatten_annotated(
strip_string(
self._safe_repr_wrapper(obj), max_length=self.max_value_length
)
)
return None

is_span_attribute = _is_span_attribute()
is_span_attribute = self._is_span_attribute()
if (is_databag or is_span_attribute) and global_repr_processors:
hints = {"memo": memo, "remaining_depth": remaining_depth}
hints = {"memo": self.memo, "remaining_depth": remaining_depth}
for processor in global_repr_processors:
result = processor(obj, hints)
if result is not NotImplemented:
return _flatten_annotated(result)
return self._flatten_annotated(result)

sentry_repr = getattr(type(obj), "__sentry_repr__", None)

if obj is None or isinstance(obj, (bool, int, float)):
if should_repr_strings or (
isinstance(obj, float) and (math.isinf(obj) or math.isnan(obj))
):
return _safe_repr_wrapper(obj)
return self._safe_repr_wrapper(obj)
else:
return obj

Expand All @@ -292,7 +295,7 @@ def _serialize_node_impl(
return (
str(format_timestamp(obj))
if not should_repr_strings
else _safe_repr_wrapper(obj)
else self._safe_repr_wrapper(obj)
)

elif isinstance(obj, Mapping):
Expand All @@ -305,11 +308,11 @@ def _serialize_node_impl(

for k, v in obj.items():
if remaining_breadth is not None and i >= remaining_breadth:
_annotate(len=len(obj))
self._annotate(len=len(obj))
break

str_k = str(k)
v = _serialize_node(
v = self._serialize_node(
v,
segment=str_k,
should_repr_strings=should_repr_strings,
Expand All @@ -332,11 +335,11 @@ def _serialize_node_impl(

for i, v in enumerate(obj): # type: ignore[arg-type]
if remaining_breadth is not None and i >= remaining_breadth:
_annotate(len=len(obj)) # type: ignore[arg-type]
self._annotate(len=len(obj)) # type: ignore[arg-type]
break

rv_list.append(
_serialize_node(
self._serialize_node(
v,
segment=i,
should_repr_strings=should_repr_strings,
Expand All @@ -352,30 +355,63 @@ def _serialize_node_impl(
return rv_list

if should_repr_strings:
obj = _safe_repr_wrapper(obj)
obj = self._safe_repr_wrapper(obj)
else:
if isinstance(obj, bytes) or isinstance(obj, bytearray):
obj = obj.decode("utf-8", "replace")

if not isinstance(obj, str):
obj = _safe_repr_wrapper(obj)
obj = self._safe_repr_wrapper(obj)

is_span_description = (
len(path) == 3 and path[0] == "spans" and path[-1] == "description"
len(self.path) == 3
and self.path[0] == "spans"
and self.path[-1] == "description"
)
if is_span_description:
return obj

return _flatten_annotated(strip_string(obj, max_length=max_value_length))
return self._flatten_annotated(
strip_string(obj, max_length=self.max_value_length)
)


def serialize(event: "Dict[str, Any]", **kwargs: "Any") -> "Dict[str, Any]":
"""
A very smart serializer that takes a dict and emits a json-friendly dict.
Currently used for serializing the final Event and also prematurely while fetching the stack
local variables for each frame in a stacktrace.

It works internally with 'databags' which are arbitrary data structures like Mapping, Sequence and Set.
The algorithm itself is a recursive graph walk down the data structures it encounters.

It has the following responsibilities:
* Trimming databags and keeping them within MAX_DATABAG_BREADTH and MAX_DATABAG_DEPTH.
* Calling safe_repr() on objects appropriately to keep them informative and readable in the final payload.
* Annotating the payload with the _meta field whenever trimming happens.

:param max_request_body_size: If set to "always", will never trim request bodies.
:param max_value_length: The max length to strip strings to, or None to disable string truncation. Defaults to None.
:param is_vars: If we're serializing vars early, we want to repr() things that are JSON-serializable to make their type more apparent. For example, it's useful to see the difference between a unicode-string and a bytestring when viewing a stacktrace.
:param custom_repr: A custom repr function that runs before safe_repr on the object to be serialized. If it returns None or throws internally, we will fallback to safe_repr.

"""
serializer = _Serializer(
keep_request_bodies=kwargs.pop("max_request_body_size", None) == "always",
max_value_length=kwargs.pop("max_value_length", None),
is_vars=kwargs.pop("is_vars", False),
custom_repr=kwargs.pop("custom_repr", None),
)

#
# Start of serialize() function
#
disable_capture_event.set(True)
try:
serialized_event = _serialize_node(event, **kwargs)
if not is_vars and meta_stack and isinstance(serialized_event, dict):
serialized_event["_meta"] = meta_stack[0]
serialized_event = serializer._serialize_node(event, **kwargs)
if (
not serializer.is_vars
and serializer.meta_stack
and isinstance(serialized_event, dict)
):
serialized_event["_meta"] = serializer.meta_stack[0]

return serialized_event
finally:
Expand Down
24 changes: 24 additions & 0 deletions tests/test_serializer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import gc
import re
from array import array

Expand Down Expand Up @@ -183,6 +184,29 @@ def test_max_value_length(body_normalizer):
assert len(result["key"]) == max_value_length


def test_serialize_does_not_leave_cyclic_garbage():
# Applications running with the GC disabled rely on serialize() being
# freed by reference counting alone, so it must not create reference
# cycles.
gc_was_enabled = gc.isenabled()
old_debug_flags = gc.get_debug()
gc.collect()
gc.disable()
try:
serialize({"extra": {"foo": [{"bar": i} for i in range(20)]}})
serialize({"foo": "bar"}, is_vars=True)

gc.set_debug(gc.DEBUG_SAVEALL)
gc.collect()
assert gc.garbage == []
finally:
gc.set_debug(old_debug_flags)
gc.garbage.clear()
gc.collect()
if gc_was_enabled:
gc.enable()


def test_serialize_local_vars():
# This was added to make sure we don't try to iterate over instances of
# custom classes with an __iter__ method due to potential side effects
Expand Down
Loading