From 24d585190d9e9a3624afe7b595e012a3d53238d7 Mon Sep 17 00:00:00 2001 From: NaveenKumarG-dev Date: Thu, 18 Jun 2026 11:15:42 +0530 Subject: [PATCH] gh-151618: Add nesting depth limit to xml.etree.ElementTree TreeBuilder Add a MAX_XML_NESTING_DEPTH constant (5000 levels) in treebuilder_handle_start() to prevent C stack overflows caused by deeply nested XML documents. When the limit is exceeded, ParseError is raised with a descriptive message instead of silently building a tree that could crash the interpreter during GC or deepcopy. The FIXME comment /* FIXME: check stack size? */ in treebuilder_done() is replaced with an accurate comment explaining where the guard lives. Note: _Py_EnterRecursiveCall is not suitable here because in Python 3.12+ it checks the C machine stack pointer, but Expat calls treebuilder_handle_start iteratively at a constant depth, so the pointer check never triggers. Also document 'deeply nested elements' as an XML DoS attack vector in Doc/library/xml.rst, which previously listed only 4 vectors. --- Doc/library/xml.rst | 6 ++ Lib/test/test_xml_etree.py | 85 +++++++++++++++++++ ...-06-18-11-13-54.gh-issue-151618.rX7vQk.rst | 5 ++ Modules/_elementtree.c | 28 +++++- 4 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-18-11-13-54.gh-issue-151618.rX7vQk.rst diff --git a/Doc/library/xml.rst b/Doc/library/xml.rst index 62cf616ef37782..c88e10446dd38e 100644 --- a/Doc/library/xml.rst +++ b/Doc/library/xml.rst @@ -125,6 +125,12 @@ large tokens be used to cause denial of service in the application parsing XML. The issue is known as :cve:`2023-52425`. +deeply nested elements + An attacker can send XML with an arbitrarily large element nesting depth + to exhaust available stack space or memory. :mod:`xml.etree.ElementTree` + limits nesting depth to 5000 levels and raises + :exc:`~xml.etree.ElementTree.ParseError` when the limit is exceeded. + .. _libexpat: https://github.com/libexpat/libexpat .. 
_Billion Laughs: https://en.wikipedia.org/wiki/Billion_laughs .. _ZIP bomb: https://en.wikipedia.org/wiki/Zip_bomb diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index acec4ec2ca257c..e2e7dc031bcb30 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -4952,5 +4952,90 @@ def cleanup(): unittest.addModuleCleanup(ET._set_factories, *old_factories) + +# -------------------------------------------------------------------- + + +class NestingDepthTest(unittest.TestCase): + """Tests for deeply-nested XML documents (gh-151618). + + xml.etree.ElementTree must raise ParseError instead of crashing + (SIGSEGV / C stack overflow) when element nesting depth exceeds the + internal MAX_XML_NESTING_DEPTH limit (5000 levels). + + The guard lives in the C accelerator (_elementtree.c) so these tests + are skipped when running against the pure-Python implementation. + """ + + @classmethod + def setUpClass(cls): + # The nesting depth guard is implemented in the C accelerator only. 
+ if ET is pyET: + raise unittest.SkipTest('nesting depth guard requires the C accelerator') + + # Must match MAX_XML_NESTING_DEPTH in Modules/_elementtree.c + MAX_DEPTH = 5000 + + def _make_deeply_nested_xml(self, depth): + """Return bytes of a well-formed XML document with *depth* nested elements.""" + return b'<a>' * depth + b'</a>' * depth + + def test_deeply_nested_xml_raises_parse_error(self): + """Parsing XML deeper than MAX_XML_NESTING_DEPTH must raise ParseError.""" + depth = self.MAX_DEPTH + 100 + xml_data = self._make_deeply_nested_xml(depth) + with self.assertRaises(ET.ParseError) as cm: + ET.fromstring(xml_data) + self.assertIn("nesting depth", str(cm.exception)) + + def test_moderately_nested_xml_succeeds(self): + """XML nesting within the limit must parse successfully.""" + depth = 100 # well within any reasonable limit + xml_data = self._make_deeply_nested_xml(depth) + root = ET.fromstring(xml_data) + # Walk down the chain of first-children to verify structure + elem = root + for _ in range(depth - 1): + self.assertEqual(len(elem), 1) + elem = elem[0] + self.assertEqual(len(elem), 0) # innermost element has no children + + def test_at_exactly_max_depth_raises_parse_error(self): + """XML at exactly MAX_DEPTH + 1 levels must raise ParseError.""" + # MAX_DEPTH + 1 because the root element itself counts as depth 0, + # so nesting one more child than the limit must fail. 
+ xml_data = self._make_deeply_nested_xml(self.MAX_DEPTH + 1) + with self.assertRaises(ET.ParseError): + ET.fromstring(xml_data) + + def test_at_max_depth_succeeds(self): + """XML at exactly MAX_DEPTH levels must succeed (boundary check).""" + xml_data = self._make_deeply_nested_xml(self.MAX_DEPTH) + # Should parse successfully — the limit is "strictly greater than" + root = ET.fromstring(xml_data) + self.assertIsNotNone(root) + + def test_treebuilder_nesting_limit(self): + """TreeBuilder.start() must raise ParseError when depth exceeds limit.""" + tb = ET.TreeBuilder() + # Fill exactly up to the limit: MAX_DEPTH open elements are accepted + for _ in range(self.MAX_DEPTH): + tb.start('a', {}) + # One more push should raise ParseError + with self.assertRaises(ET.ParseError) as cm: + tb.start('a', {}) + self.assertIn("nesting depth", str(cm.exception)) + + def test_xmlparser_deeply_nested_raises_parse_error(self): + """XMLParser.feed() with deeply nested XML must raise ParseError.""" + depth = self.MAX_DEPTH + 100 + xml_data = self._make_deeply_nested_xml(depth) + parser = ET.XMLParser() + with self.assertRaises(ET.ParseError): + parser.feed(xml_data) + + +# -------------------------------------------------------------------- + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2026-06-18-11-13-54.gh-issue-151618.rX7vQk.rst b/Misc/NEWS.d/next/Library/2026-06-18-11-13-54.gh-issue-151618.rX7vQk.rst new file mode 100644 index 00000000000000..4d412c0744b261 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-18-11-13-54.gh-issue-151618.rX7vQk.rst @@ -0,0 +1,5 @@ +Add a nesting depth limit (5000 levels) to :mod:`xml.etree.ElementTree`'s +``TreeBuilder`` to prevent C stack overflows when deeply nested XML documents +are garbage-collected or copied. :exc:`~xml.etree.ElementTree.ParseError` is +now raised for documents exceeding the limit. Also document "deeply nested +elements" as an XML security attack vector in :doc:`/library/xml`. 
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index f827274eeffba8..f292d370a3988a 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -2738,6 +2738,18 @@ treebuilder_append_event(TreeBuilderObject *self, PyObject *action, /* -------------------------------------------------------------------- */ /* handlers */ +/* Maximum allowed XML element nesting depth in TreeBuilder. + * Deeply nested XML documents can exhaust the C stack when the resulting + * tree is later traversed recursively (e.g., during garbage collection or + * deepcopy). This constant limits the depth at parse time so that a clean + * error is raised instead of a C stack overflow crash. + * + * The limit is a fixed compile-time constant; no runtime override is + * implemented. The PYTHON_XML_MAX_NESTING environment variable name is + * reserved for future use and is not read by this code. + */ +#define MAX_XML_NESTING_DEPTH 5000 + LOCAL(PyObject*) treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag, PyObject* attrib) @@ -2746,6 +2758,17 @@ treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag, PyObject* this; elementtreestate *st = self->state; + /* Guard against deeply-nested XML that would cause C stack overflows + * when the resulting tree is traversed recursively later (gh-151618). We + * check self->index *before* pushing, so the root element (index==0) is + * always accepted. */ + if (self->index >= MAX_XML_NESTING_DEPTH) { + PyErr_Format(st->parseerror_obj, + "xml nesting depth limit (%d levels) exceeded", + MAX_XML_NESTING_DEPTH); + return NULL; + } + if (treebuilder_flush_data(self) < 0) { return NULL; } @@ -3066,7 +3089,10 @@ treebuilder_done(TreeBuilderObject* self) { PyObject* res; - /* FIXME: check stack size? */ + /* XML nesting depth is bounded at parse time by treebuilder_handle_start, + * which raises ParseError when MAX_XML_NESTING_DEPTH is exceeded. 
This + * prevents C stack overflows when deeply nested trees are later traversed + * recursively (e.g., during garbage collection or deepcopy). */ if (self->root) res = self->root;