A simple, fast lxml-based TCX parser for Python.


import pandas as pd
from lxml import etree
from io import BytesIO

def parse_tcx(tcx: bytes) -> pd.DataFrame:
    """Parse a TCX document into a DataFrame indexed by trackpoint time.

    Every ``Trackpoint`` element in the Garmin TrainingCenterDatabase v2
    namespace becomes one row. For each field only the first matching value
    inside the trackpoint is used; a field that is absent yields a missing
    value.

    Args:
        tcx: Raw bytes of the TCX (XML) document.

    Returns:
        A DataFrame indexed by ``Time`` (converted with ``pd.to_datetime``)
        with the columns ``LatitudeDegrees``, ``LongitudeDegrees``,
        ``AltitudeMeters``, ``DistanceMeters``, ``HeartRateBpm``,
        ``Cadence``, ``Speed`` and ``Power``. Each of those columns is
        converted with ``pd.to_numeric(errors='coerce')``, so missing or
        non-numeric text becomes NaN. A document without any trackpoints
        yields an empty frame with the same index name and columns.

    Raises:
        lxml.etree.XMLSyntaxError: If ``tcx`` is not well-formed XML (raised
            by ``etree.parse``).
    """
    # NOTE(review): entity resolution is left at the parser's default. If
    # `tcx` can come from untrusted sources, consider passing
    # resolve_entities=False here - confirm with callers.
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.parse(BytesIO(tcx), parser=parser).getroot()

    namespaces = {
        'ns': 'http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2',
        'ns3': 'http://www.garmin.com/xmlschemas/ActivityExtension/v2',
    }

    numeric_columns = [
        'LatitudeDegrees',
        'LongitudeDegrees',
        'AltitudeMeters',
        'DistanceMeters',
        'HeartRateBpm',
        'Cadence',
        'Speed',
        'Power',
    ]

    # Fields that are direct children of <Trackpoint>, read via findtext().
    child_paths = {
        'Time': 'ns:Time',
        'AltitudeMeters': 'ns:AltitudeMeters',
        'DistanceMeters': 'ns:DistanceMeters',
        'Cadence': 'ns:Cadence',
    }

    # Nested fields, read as text nodes. The expressions are compiled once
    # here rather than once per trackpoint inside the loop.
    nested_paths = {
        'LatitudeDegrees': 'ns:Position/ns:LatitudeDegrees/text()',
        'LongitudeDegrees': 'ns:Position/ns:LongitudeDegrees/text()',
        'HeartRateBpm': 'ns:HeartRateBpm/ns:Value/text()',
        'Speed': 'ns:Extensions/ns3:TPX/ns3:Speed/text()',
        'Power': 'ns:Extensions/ns3:TPX/ns3:Watts/text()',
    }
    nested_finders = {
        name: etree.XPath(expr, namespaces=namespaces)
        for name, expr in nested_paths.items()
    }

    rows = []
    for tp in root.xpath('//ns:Trackpoint', namespaces=namespaces):
        row = {
            name: tp.findtext(path, namespaces=namespaces)
            for name, path in child_paths.items()
        }
        for name, find in nested_finders.items():
            matches = find(tp)
            # XPath returns a (possibly empty) list of text nodes; keep the
            # first one, or None when the element is missing.
            row[name] = matches[0] if matches else None
        rows.append(row)

    # Passing the column list explicitly keeps the schema stable even when
    # there are no trackpoints; otherwise the empty frame would have no
    # columns and the conversions below would raise KeyError.
    df = pd.DataFrame(rows, columns=['Time', *numeric_columns])

    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['Time'] = pd.to_datetime(df['Time'])
    df.set_index('Time', inplace=True)

    return df