diff --git a/sentry_sdk/serializer.py b/sentry_sdk/serializer.py index 405ea6f105..6bf6f6e70b 100644 --- a/sentry_sdk/serializer.py +++ b/sentry_sdk/serializer.py @@ -89,57 +89,56 @@ def __exit__( self._ids.pop(id(self._objs.pop()), None) -def serialize(event: "Dict[str, Any]", **kwargs: "Any") -> "Dict[str, Any]": - """ - A very smart serializer that takes a dict and emits a json-friendly dict. - Currently used for serializing the final Event and also prematurely while fetching the stack - local variables for each frame in a stacktrace. - - It works internally with 'databags' which are arbitrary data structures like Mapping, Sequence and Set. - The algorithm itself is a recursive graph walk down the data structures it encounters. - - It has the following responsibilities: - * Trimming databags and keeping them within MAX_DATABAG_BREADTH and MAX_DATABAG_DEPTH. - * Calling safe_repr() on objects appropriately to keep them informative and readable in the final payload. - * Annotating the payload with the _meta field whenever trimming happens. - - :param max_request_body_size: If set to "always", will never trim request bodies. - :param max_value_length: The max length to strip strings to, or None to disable string truncation. Defaults to None. - :param is_vars: If we're serializing vars early, we want to repr() things that are JSON-serializable to make their type more apparent. For example, it's useful to see the difference between a unicode-string and a bytestring when viewing a stacktrace. - :param custom_repr: A custom repr function that runs before safe_repr on the object to be serialized. If it returns None or throws internally, we will fallback to safe_repr. - - """ - memo = Memo() - path: "List[Segment]" = [] - meta_stack: "List[Dict[str, Any]]" = [] - - keep_request_bodies: bool = kwargs.pop("max_request_body_size", None) == "always" - max_value_length: "Optional[int]" = kwargs.pop("max_value_length", None) - is_vars = kwargs.pop("is_vars", False) - custom_repr: "Callable[..., Optional[str]]" = kwargs.pop("custom_repr", None) - - def _safe_repr_wrapper(value: "Any") -> str: +class _Serializer: + """Holds the state of a single serialize() call.""" + + __slots__ = ( + "memo", + "path", + "meta_stack", + "keep_request_bodies", + "max_value_length", + "is_vars", + "custom_repr", + ) + + def __init__( + self, + keep_request_bodies: bool, + max_value_length: "Optional[int]", + is_vars: bool, + custom_repr: "Optional[Callable[..., Optional[str]]]", + ) -> None: + self.memo = Memo() + self.path: "List[Segment]" = [] + self.meta_stack: "List[Dict[str, Any]]" = [] + self.keep_request_bodies = keep_request_bodies + self.max_value_length = max_value_length + self.is_vars = is_vars + self.custom_repr = custom_repr + + def _safe_repr_wrapper(self, value: "Any") -> str: try: repr_value = None - if custom_repr is not None: - repr_value = custom_repr(value) + if self.custom_repr is not None: + repr_value = self.custom_repr(value) return repr_value or safe_repr(value) except Exception: return safe_repr(value) - def _annotate(**meta: "Any") -> None: - while len(meta_stack) <= len(path): + def _annotate(self, **meta: "Any") -> None: + while len(self.meta_stack) <= len(self.path): try: - segment = path[len(meta_stack) - 1] - node = meta_stack[-1].setdefault(str(segment), {}) + segment = self.path[len(self.meta_stack) - 1] + node = self.meta_stack[-1].setdefault(str(segment), {}) except IndexError: node = {} - meta_stack.append(node) + self.meta_stack.append(node) - meta_stack[-1].setdefault("", {}).update(meta) + self.meta_stack[-1].setdefault("", {}).update(meta) - def _is_databag() -> "Optional[bool]": + def _is_databag(self) -> "Optional[bool]": """ A databag is any value that we need to trim. True for stuff like vars, request bodies, breadcrumbs and extra. @@ -147,16 +146,16 @@ def _is_databag() -> "Optional[bool]": :returns: `True` for "yes", `False` for :"no", `None` for "maybe soon". """ try: - if is_vars: + if self.is_vars: return True - is_request_body = _is_request_body() + is_request_body = self._is_request_body() if is_request_body in (True, None): return is_request_body - p0 = path[0] - if p0 == "breadcrumbs" and path[1] == "values": - path[2] + p0 = self.path[0] + if p0 == "breadcrumbs" and self.path[1] == "values": + self.path[2] return True if p0 == "extra": @@ -167,18 +166,18 @@ def _is_databag() -> "Optional[bool]": return False - def _is_span_attribute() -> "Optional[bool]": + def _is_span_attribute(self) -> "Optional[bool]": try: - if path[0] == "spans" and path[2] == "data": + if self.path[0] == "spans" and self.path[2] == "data": return True except IndexError: return None return False - def _is_request_body() -> "Optional[bool]": + def _is_request_body(self) -> "Optional[bool]": try: - if path[0] == "request" and path[1] == "data": + if self.path[0] == "request" and self.path[1] == "data": return True except IndexError: return None @@ -186,6 +185,7 @@ def _is_request_body() -> "Optional[bool]": return False def _serialize_node( + self, obj: "Any", is_databag: "Optional[bool]" = None, is_request_body: "Optional[bool]" = None, @@ -195,14 +195,14 @@ def _serialize_node( remaining_depth: "Optional[Union[int, float]]" = None, ) -> "Any": if segment is not None: - path.append(segment) + self.path.append(segment) try: - with memo.memoize(obj) as result: + with self.memo.memoize(obj) as result: if result: return CYCLE_MARKER - return _serialize_node_impl( + return self._serialize_node_impl( obj, is_databag=is_databag, is_request_body=is_request_body, @@ -219,16 +219,17 @@ def _serialize_node( return None finally: if segment is not None: - path.pop() - del meta_stack[len(path) + 1 :] + self.path.pop() + del self.meta_stack[len(self.path) + 1 :] - def _flatten_annotated(obj: "Any") -> "Any": + def _flatten_annotated(self, obj: "Any") -> "Any": if isinstance(obj, AnnotatedValue): - _annotate(**obj.metadata) + self._annotate(**obj.metadata) obj = obj.value return obj def _serialize_node_impl( + self, obj: "Any", is_databag: "Optional[bool]", is_request_body: "Optional[bool]", @@ -239,16 +240,16 @@ def _serialize_node_impl( if isinstance(obj, AnnotatedValue): should_repr_strings = False if should_repr_strings is None: - should_repr_strings = is_vars + should_repr_strings = self.is_vars if is_databag is None: - is_databag = _is_databag() + is_databag = self._is_databag() if is_request_body is None: - is_request_body = _is_request_body() + is_request_body = self._is_request_body() if is_databag: - if is_request_body and keep_request_bodies: + if is_request_body and self.keep_request_bodies: remaining_depth = float("inf") remaining_breadth = float("inf") else: @@ -257,23 +258,25 @@ def _serialize_node_impl( if remaining_breadth is None: remaining_breadth = MAX_DATABAG_BREADTH - obj = _flatten_annotated(obj) + obj = self._flatten_annotated(obj) if remaining_depth is not None and remaining_depth <= 0: - _annotate(rem=[["!limit", "x"]]) + self._annotate(rem=[["!limit", "x"]]) if is_databag: - return _flatten_annotated( - strip_string(_safe_repr_wrapper(obj), max_length=max_value_length) + return self._flatten_annotated( + strip_string( + self._safe_repr_wrapper(obj), max_length=self.max_value_length + ) ) return None - is_span_attribute = _is_span_attribute() + is_span_attribute = self._is_span_attribute() if (is_databag or is_span_attribute) and global_repr_processors: - hints = {"memo": memo, "remaining_depth": remaining_depth} + hints = {"memo": self.memo, "remaining_depth": remaining_depth} for processor in global_repr_processors: result = processor(obj, hints) if result is not NotImplemented: - return _flatten_annotated(result) + return self._flatten_annotated(result) sentry_repr = getattr(type(obj), "__sentry_repr__", None) @@ -281,7 +284,7 @@ def _serialize_node_impl( if should_repr_strings or ( isinstance(obj, float) and (math.isinf(obj) or math.isnan(obj)) ): - return _safe_repr_wrapper(obj) + return self._safe_repr_wrapper(obj) else: return obj @@ -292,7 +295,7 @@ def _serialize_node_impl( return ( str(format_timestamp(obj)) if not should_repr_strings - else _safe_repr_wrapper(obj) + else self._safe_repr_wrapper(obj) ) elif isinstance(obj, Mapping): @@ -305,11 +308,11 @@ def _serialize_node_impl( for k, v in obj.items(): if remaining_breadth is not None and i >= remaining_breadth: - _annotate(len=len(obj)) + self._annotate(len=len(obj)) break str_k = str(k) - v = _serialize_node( + v = self._serialize_node( v, segment=str_k, should_repr_strings=should_repr_strings, @@ -332,11 +335,11 @@ def _serialize_node_impl( for i, v in enumerate(obj): # type: ignore[arg-type] if remaining_breadth is not None and i >= remaining_breadth: - _annotate(len=len(obj)) # type: ignore[arg-type] + self._annotate(len=len(obj)) # type: ignore[arg-type] break rv_list.append( - _serialize_node( + self._serialize_node( v, segment=i, should_repr_strings=should_repr_strings, @@ -352,30 +355,63 @@ def _serialize_node_impl( return rv_list if should_repr_strings: - obj = _safe_repr_wrapper(obj) + obj = self._safe_repr_wrapper(obj) else: if isinstance(obj, bytes) or isinstance(obj, bytearray): obj = obj.decode("utf-8", "replace") if not isinstance(obj, str): - obj = _safe_repr_wrapper(obj) + obj = self._safe_repr_wrapper(obj) is_span_description = ( - len(path) == 3 and path[0] == "spans" and path[-1] == "description" + len(self.path) == 3 + and self.path[0] == "spans" + and self.path[-1] == "description" ) if is_span_description: return obj - return _flatten_annotated(strip_string(obj, max_length=max_value_length)) + return self._flatten_annotated( + strip_string(obj, max_length=self.max_value_length) + ) + + +def serialize(event: "Dict[str, Any]", **kwargs: "Any") -> "Dict[str, Any]": + """ + A very smart serializer that takes a dict and emits a json-friendly dict. + Currently used for serializing the final Event and also prematurely while fetching the stack + local variables for each frame in a stacktrace. + + It works internally with 'databags' which are arbitrary data structures like Mapping, Sequence and Set. + The algorithm itself is a recursive graph walk down the data structures it encounters. + + It has the following responsibilities: + * Trimming databags and keeping them within MAX_DATABAG_BREADTH and MAX_DATABAG_DEPTH. + * Calling safe_repr() on objects appropriately to keep them informative and readable in the final payload. + * Annotating the payload with the _meta field whenever trimming happens. + + :param max_request_body_size: If set to "always", will never trim request bodies. + :param max_value_length: The max length to strip strings to, or None to disable string truncation. Defaults to None. + :param is_vars: If we're serializing vars early, we want to repr() things that are JSON-serializable to make their type more apparent. For example, it's useful to see the difference between a unicode-string and a bytestring when viewing a stacktrace. + :param custom_repr: A custom repr function that runs before safe_repr on the object to be serialized. If it returns None or throws internally, we will fallback to safe_repr. + + """ + serializer = _Serializer( + keep_request_bodies=kwargs.pop("max_request_body_size", None) == "always", + max_value_length=kwargs.pop("max_value_length", None), + is_vars=kwargs.pop("is_vars", False), + custom_repr=kwargs.pop("custom_repr", None), + ) - # - # Start of serialize() function - # disable_capture_event.set(True) try: - serialized_event = _serialize_node(event, **kwargs) - if not is_vars and meta_stack and isinstance(serialized_event, dict): - serialized_event["_meta"] = meta_stack[0] + serialized_event = serializer._serialize_node(event, **kwargs) + if ( + not serializer.is_vars + and serializer.meta_stack + and isinstance(serialized_event, dict) + ): + serialized_event["_meta"] = serializer.meta_stack[0] return serialized_event finally: diff --git a/tests/test_serializer.py b/tests/test_serializer.py index f1483aba8d..0c5255c994 100644 --- a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -1,3 +1,4 @@ +import gc import re from array import array @@ -183,6 +184,29 @@ def test_max_value_length(body_normalizer): assert len(result["key"]) == max_value_length +def test_serialize_does_not_leave_cyclic_garbage(): + # Applications running with the GC disabled rely on serialize() being + # freed by reference counting alone, so it must not create reference + # cycles. + gc_was_enabled = gc.isenabled() + old_debug_flags = gc.get_debug() + gc.collect() + gc.disable() + try: + serialize({"extra": {"foo": [{"bar": i} for i in range(20)]}}) + serialize({"foo": "bar"}, is_vars=True) + + gc.set_debug(gc.DEBUG_SAVEALL) + gc.collect() + assert gc.garbage == [] + finally: + gc.set_debug(old_debug_flags) + gc.garbage.clear() + gc.collect() + if gc_was_enabled: + gc.enable() + + def test_serialize_local_vars(): # This was added to make sure we don't try to iterate over instances of # custom classes with an __iter__ method due to potential side effects