How to compress whitespace in XML using Python
The following Python function uses lxml, a fast XML library,
to remove unnecessary whitespace from an XML string. It does not alter whitespace inside text nodes; only the whitespace in the XML structure itself (between tags) is removed.
import io
from lxml import etree
def compress_xml_whitespace(input_bytes: io.BytesIO) -> io.BytesIO:
    """Strip structural whitespace from an in-memory XML document.

    The document is parsed with blank-text removal enabled, so
    whitespace-only text between tags (indentation, newlines) is dropped,
    and it is then serialized again without pretty printing.

    Args:
        input_bytes: Stream holding the XML content. It is rewound to the
            start before parsing, so its current position is changed by
            this call.

    Returns:
        A new stream containing the compacted XML, encoded as UTF-8.
    """
    # Rewind first so the full document is parsed even if the caller has
    # already read from (or just written to) the stream.
    input_bytes.seek(0)

    document = etree.parse(
        input_bytes,
        etree.XMLParser(remove_blank_text=True),
    )

    # pretty_print=False stops the serializer from adding newlines or
    # indentation back in.
    serialized = etree.tostring(document, pretty_print=False, encoding='utf-8')

    # A BytesIO built from initial bytes already starts at position 0.
    return io.BytesIO(serialized)
Demonstration
# Sample document for the demonstration: whitespace appears both between tags
# (structure) and inside <subchild> text, including one element whose only
# content is whitespace.
test_xml = """<root>
<parent>
<child>
<subchild> This is some text with irregular spacing. </subchild>
<subchild>Another piece of text with
newlines and tabs.</subchild>
</child>
<child>
<subchild>Text with
multiple
lines.</subchild>
<subchild> Leading and trailing spaces </subchild>
</child>
</parent>
<parent>
<child>
<subchild>Mixed whitespace types.</subchild>
<subchild> </subchild>
</child>
</parent>
</root>"""
# The function takes a BytesIO, so encode the sample to UTF-8 bytes and wrap it.
test_xml_bytesio = io.BytesIO(test_xml.encode('utf-8'))
# Read the compressed result and decode it back to str. This is a bare
# expression, so the value is only displayed in an interactive session.
compress_xml_whitespace(test_xml_bytesio).read().decode('utf-8')
Output:
'<root><parent><child><subchild> This is some text with irregular spacing. </subchild><subchild>Another piece of text with\n newlines and tabs.</subchild></child><child><subchild>Text with\n multiple\n lines.</subchild><subchild> Leading and trailing spaces </subchild></child></parent><parent><child><subchild>Mixed whitespace types.</subchild><subchild> </subchild></child></parent></root>'