How to fix landscape-package-reporter: UnicodeDecodeError: 'utf-8' codec can't decode byte

On some servers attached to a landscape instance, I encountered this stacktrace when trying to run sudo landscape-package-reporter:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/lib/python3/dist-packages/landscape/client/package/reporter.py", line 92, in <lambda>
    result.addCallback(lambda x: self.request_unknown_hashes())
  File "/usr/lib/python3/dist-packages/landscape/client/package/reporter.py", line 485, in request_unknown_hashes
    self._facade.ensure_channels_reloaded()
  File "/usr/lib/python3/dist-packages/landscape/lib/apt/package/facade.py", line 265, in ensure_channels_reloaded
    self.reload_channels()
  File "/usr/lib/python3/dist-packages/landscape/lib/apt/package/facade.py", line 253, in reload_channels
    version, with_info=False).get_hash()
  File "/usr/lib/python3/dist-packages/landscape/lib/apt/package/facade.py", line 402, in get_package_skeleton
    return build_skeleton_apt(pkg, with_info=with_info, with_unicode=True)
  File "/usr/lib/python3/dist-packages/landscape/lib/apt/package/skeleton.py", line 131, in build_skeleton_apt
    version.record, "Provides", DEB_PROVIDES))
  File "/usr/lib/python3/dist-packages/apt/package.py", line 690, in record
    return Record(self._records.record)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 724: invalid start byte

Tracing the issue down, it turned out to be related to a misplaced set of Unicode bytes (EF BF BD) in an old Veeam release, version 1.0.0.944 of the veeamsnap package, listed in /var/lib/apt/lists/repository.veeam.com_backup_linux_agent_dpkg_debian_public_dists_stable_veeam_binary-amd64_Packages. The Description field contains this text:

[...] Linux � simple [...]

The strange character is the U+FFFD � REPLACEMENT CHARACTER.

You can fix it by deleting this character. It’s just at the end of /var/lib/apt/lists/repository.veeam.com_backup_linux_agent_dpkg_debian_public_dists_stable_veeam_binary-amd64_Packages. However, if there’s an update for that repository, your change will be overwritten.

In order to fix it (my fix is for landscape-client version 18.01-0ubuntu3.5), I added a try: ... except UnicodeError: ... clause to the build_skeleton_apt() function in skeleton.py, which will ignore some properties of the package where the issue occurs:

try:
    relations.update(parse_record_field(
        version.record, "Provides", DEB_PROVIDES))
    relations.add((
        DEB_NAME_PROVIDES,
        "%s = %s" % (version.package.name, version.version)))
    relations.update(parse_record_field(
        version.record, "Pre-Depends", DEB_REQUIRES, DEB_OR_REQUIRES))
    relations.update(parse_record_field(
        version.record, "Depends", DEB_REQUIRES, DEB_OR_REQUIRES))

    relations.add((
        DEB_UPGRADES, "%s < %s" % (version.package.name, version.version)))

    relations.update(parse_record_field(
        version.record, "Conflicts", DEB_CONFLICTS))
    relations.update(parse_record_field(
        version.record, "Breaks", DEB_CONFLICTS))
    skeleton.relations = sorted(relations)

    if with_info:
        skeleton.section = version.section
        skeleton.summary = version.summary
        skeleton.description = version.description
        skeleton.size = version.size
        if version.installed_size > 0:
            skeleton.installed_size = version.installed_size
        if with_unicode and not _PY3:
            skeleton.section = skeleton.section.decode("utf-8")
            skeleton.summary = skeleton.summary.decode("utf-8")
            # Avoid double-decoding package descriptions in build_skeleton_apt,
            # which causes an error with newer python-apt (Xenial onwards)
            if not isinstance(skeleton.description, unicode):
                skeleton.description = skeleton.description.decode("utf-8")
    return skeleton
except UnicodeError:
    return skeleton

Replace the contents of /usr/lib/python3/dist-packages/landscape/lib/apt/package/skeleton.py with this:

from landscape.lib.hashlib import sha1

import apt_pkg

from twisted.python.compat import unicode, _PY3


# Generic relation kinds, one bit each.
PACKAGE = 0x01
PROVIDES = 0x02
REQUIRES = 0x04
UPGRADES = 0x08
CONFLICTS = 0x10

# Deb-specific relation types: a distinct number stored from bit 16 upwards,
# OR-ed with the generic kind (low bits) that the type belongs to.
DEB_PACKAGE = (1 << 16) | PACKAGE
DEB_PROVIDES = (2 << 16) | PROVIDES
DEB_NAME_PROVIDES = (3 << 16) | PROVIDES
DEB_REQUIRES = (4 << 16) | REQUIRES
DEB_OR_REQUIRES = (5 << 16) | REQUIRES
DEB_UPGRADES = (6 << 16) | UPGRADES
DEB_CONFLICTS = (7 << 16) | CONFLICTS


class PackageTypeError(Exception):
    """Unsupported package type.

    Raised when build_skeleton is handed a package type it does not support.
    """


class PackageSkeleton(object):
    """Type, name, version and relations of a package, plus optional info.

    The info attributes declared at class level stay C{None} until a value
    is assigned on the instance.
    """

    section = None
    summary = None
    description = None
    size = None
    installed_size = None
    _hash = None

    def __init__(self, type, name, version):
        """Store the identifying triple and start with no relations."""
        self.type, self.name, self.version = type, name, version
        self.relations = []

    def add_relation(self, type, info):
        """Append a C{(type, info)} relation to the skeleton."""
        relation = (type, info)
        self.relations.append(relation)

    def get_hash(self):
        """Return the hash identifying this package.

        A hash stored through C{set_hash} takes precedence; in that case
        nothing is computed. Otherwise the SHA1 digest of the package
        header followed by every relation is returned. Note that
        C{self.relations} is sorted in place as part of the computation.
        """
        if self._hash is not None:
            return self._hash
        # ASCII is kept as the encoding for backwards compatibility: it was
        # the default unicode-to-bytes conversion in Python 2.7.
        header = "[%d %s %s]" % (self.type, self.name, self.version)
        hasher = sha1(header.encode("ascii"))
        self.relations.sort()
        for entry in self.relations:
            chunk = "[%d %s]" % (entry[0], entry[1])
            hasher.update(chunk.encode("ascii"))
        return hasher.digest()

    def set_hash(self, package_hash):
        """Force C{get_hash} to return C{package_hash}.

        Meant for cases where the hash is already known and can no longer
        be derived from the relations. Package resurrection is the only
        use case; once resurrection is removed, this method can go too.
        """
        self._hash = package_hash


def relation_to_string(relation_tuple):
    """Render an apt relation tuple as text, e.g. "name > 1.0".

    @param relation_tuple: A tuple, (name, version, relation). For a
        relation on the name alone, version and relation may both be the
        empty string.

    @return: The bare name when there is no relation, otherwise the name
        followed by the relation and the version.
    """
    name, version, relation_type = relation_tuple
    if not relation_type:
        # No comparison operator: the version (if any) is not rendered.
        return name
    return name + " %s %s" % (relation_type, version)


def parse_record_field(record, record_field, relation_type,
                       or_relation_type=None):
    """Turn one dependency field of an apt C{Record} into skeleton relations.

    @param record: An C{apt.package.Record} instance with package information.
    @param record_field: The name of the record field to parse; a missing
        field is parsed as the empty string.
    @param relation_type: The deb relation that can be passed to
        C{skeleton.add_relation()}
    @param or_relation_type: The deb relation used instead of
        C{relation_type} for an entry made of several alternatives.

    @return: A set of C{(relation type, relation string)} tuples.
    """
    raw_field = record.get(record_field, "")
    parsed = set()
    for alternatives in apt_pkg.parse_depends(raw_field):
        rendered = [relation_to_string(entry) for entry in alternatives]
        # NOTE: with or_relation_type left at its default, an entry with
        # several alternatives is recorded under the type None.
        kind = or_relation_type if len(rendered) > 1 else relation_type
        parsed.add((kind, " | ".join(rendered)))
    return parsed


def build_skeleton_apt(version, with_info=False, with_unicode=False):
    """Build a package skeleton from an apt package.

    Building is best-effort with respect to text decoding: if reading the
    package's apt record (or, with C{with_info}, one of its text fields)
    raises C{UnicodeError}, a warning is logged and the skeleton is
    returned with whatever had been filled in up to that point. When the
    record itself cannot be read, that means a skeleton without relations.

    @param version: An instance of C{apt.package.Version}
    @param with_info: Whether to extract extra information about the
        package, like description, summary, size.
    @param with_unicode: Whether the C{name} and C{version} of the
        skeleton should be unicode strings. On Python 2, together with
        C{with_info}, section, summary and description are decoded from
        UTF-8 as well.

    @return: The L{PackageSkeleton} built for C{version}.
    """
    # Imported here so the function is self-contained.
    import logging

    name, version_string = version.package.name, version.version
    if with_unicode:
        name, version_string = unicode(name), unicode(version_string)
    skeleton = PackageSkeleton(DEB_PACKAGE, name, version_string)
    relations = set()
    try:
        # Read the record once instead of once per field; this is the
        # access that fails when the package lists contain invalid UTF-8.
        record = version.record

        relations.update(parse_record_field(
            record, "Provides", DEB_PROVIDES))
        relations.add((
            DEB_NAME_PROVIDES,
            "%s = %s" % (version.package.name, version.version)))
        relations.update(parse_record_field(
            record, "Pre-Depends", DEB_REQUIRES, DEB_OR_REQUIRES))
        relations.update(parse_record_field(
            record, "Depends", DEB_REQUIRES, DEB_OR_REQUIRES))

        relations.add((
            DEB_UPGRADES, "%s < %s" % (version.package.name, version.version)))

        relations.update(parse_record_field(
            record, "Conflicts", DEB_CONFLICTS))
        relations.update(parse_record_field(
            record, "Breaks", DEB_CONFLICTS))
        skeleton.relations = sorted(relations)

        if with_info:
            skeleton.section = version.section
            skeleton.summary = version.summary
            skeleton.description = version.description
            skeleton.size = version.size
            if version.installed_size > 0:
                skeleton.installed_size = version.installed_size
            if with_unicode and not _PY3:
                skeleton.section = skeleton.section.decode("utf-8")
                skeleton.summary = skeleton.summary.decode("utf-8")
                # Avoid double-decoding package descriptions, which
                # causes an error with newer python-apt (Xenial onwards)
                if not isinstance(skeleton.description, unicode):
                    skeleton.description = skeleton.description.decode(
                        "utf-8")
    except UnicodeError as error:
        # Deliberate best-effort: keep reporting the remaining packages,
        # but leave a trace of which package was only partially described.
        logging.warning(
            "Ignoring undecodable apt data for package %s %s: %s",
            name, version_string, error)
    return skeleton

After that, you can run sudo landscape-package-reporter again.