Python script to compare directories, checking for added/deleted/changed files

The following Python script compares two directories and reports added, deleted, or changed files and directories (printed in color on consoles that support ANSI escape codes).

compare_directories.py
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2025 Uli Köhler <[email protected]>
# SPDX-License-Identifier: CC0-1.0
#
# Maintainers: KiCad Library Team
"""Compare two directories and report added, deleted, or changed entries."""

from __future__ import annotations

import argparse
import concurrent.futures
import hashlib
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Sequence, Set, Tuple


@dataclass
class DirectoryDifferences:
    """Outcome of comparing a reference directory tree with a candidate tree.

    Every field is a list of path strings. ``unchanged_files`` is carried
    along for reporting only and never counts as a difference.
    """

    # Files present only on the candidate side.
    added_files: List[str]
    # Files present only on the reference side.
    deleted_files: List[str]
    # Files present on both sides that were classified as changed.
    changed_files: List[str]
    # Directories present only on the candidate side.
    added_dirs: List[str]
    # Directories present only on the reference side.
    deleted_dirs: List[str]
    # Directories present on both sides that were classified as changed.
    changed_dirs: List[str]
    # Files present on both sides that were classified as identical.
    unchanged_files: List[str]

    @property
    def has_differences(self) -> bool:
        """Return True when any added/deleted/changed list is non-empty."""
        return bool(
            self.added_files
            or self.deleted_files
            or self.changed_files
            or self.added_dirs
            or self.deleted_dirs
            or self.changed_dirs
        )


def parse_args() -> argparse.Namespace:
    """Parse the process command line.

    Returns:
        A namespace with ``reference`` and ``candidate`` (both ``Path``
        objects) and the boolean ``verbose`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Compare two directories and report added, deleted, or changed entries."
    )
    # The two positional arguments share the same conversion; only the
    # name and help text differ.
    positionals = (
        ("reference", "Original directory to compare from"),
        ("candidate", "Directory to compare against the reference"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Also list files that are identical in both directories",
    )
    return cli.parse_args()


def ensure_directory(path: Path, role: str) -> None:
    """Validate that *path* names an existing directory.

    Args:
        path: Location to check.
        role: Label used as the prefix of the error message.

    Raises:
        FileNotFoundError: If *path* does not exist.
        NotADirectoryError: If *path* exists but is not a directory.
    """
    if path.is_dir():
        return
    # Not a directory: tell "exists as something else" apart from "missing".
    if path.exists():
        raise NotADirectoryError(f"{role} path '{path}' is not a directory")
    raise FileNotFoundError(f"{role} directory '{path}' does not exist")


def relative_entries(root: Path) -> Tuple[Dict[str, Path], Set[str]]:
    """Walk *root* and index everything below it by POSIX-style relative path.

    Args:
        root: Directory to walk with ``os.walk``.

    Returns:
        A pair ``(files, directories)``: ``files`` maps each file's relative
        path to its path under *root*, and ``directories`` is the set of
        relative directory paths. *root* itself is not included.
    """
    files: Dict[str, Path] = {}
    directories: Set[str] = set()
    for dirpath, dirnames, filenames in os.walk(root):
        here = Path(dirpath)
        rel_here = here.relative_to(root)
        # The root is relative to itself as '.', which has no parts; every
        # other visited directory is recorded.
        if rel_here.parts:
            directories.add(rel_here.as_posix())
        directories.update((rel_here / name).as_posix() for name in dirnames)
        files.update({(rel_here / name).as_posix(): here / name for name in filenames})
    return files, directories


def sha256sum(path: Path) -> str:
    """Return the SHA-256 digest of the file at *path* as lowercase hex.

    The digest is computed in-process with ``hashlib`` rather than by
    spawning the external ``sha256sum`` tool. This removes the dependency
    on that binary, avoids one process per file, and avoids the escaped
    output format GNU ``sha256sum`` uses for unusual file names (a leading
    backslash on the digest line), which previously ended up in the result.

    Args:
        path: File to hash.

    Returns:
        The 64-character hexadecimal digest.

    Raises:
        subprocess.CalledProcessError: If the file cannot be opened or read.
            The underlying ``OSError`` is chained as the cause. This
            exception type is kept so that existing callers' error handling
            for a failed checksum continues to work.
    """
    digest = hashlib.sha256()
    try:
        with open(path, "rb") as handle:
            # Read in fixed-size chunks so large files are never held in
            # memory as a whole.
            for chunk in iter(lambda: handle.read(1024 * 1024), b""):
                digest.update(chunk)
    except OSError as exc:
        raise subprocess.CalledProcessError(
            returncode=1,
            cmd=["sha256sum", str(path)],
            stderr=str(exc),
        ) from exc
    return digest.hexdigest()


def compute_checksums(root: Path, rel_paths: Sequence[str]) -> Dict[str, str]:
    """Checksum every path in *rel_paths* below *root* using a thread pool.

    Args:
        root: Directory the relative paths are joined onto.
        rel_paths: Relative paths of the files to checksum.

    Returns:
        A dict mapping each relative path to the value ``sha256sum``
        returned for it.
    """
    # At most 16 workers; fall back to one when the CPU count is unknown.
    worker_count = min(16, os.cpu_count() or 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = {pool.submit(sha256sum, root / rel): rel for rel in rel_paths}
        # Collect results in completion order; result() re-raises any
        # exception raised by the worker.
        return {
            pending[done]: done.result()
            for done in concurrent.futures.as_completed(pending)
        }


def detect_differences(
    ref_files: Dict[str, Path],
    ref_dirs: Set[str],
    cand_files: Dict[str, Path],
    cand_dirs: Set[str],
    ref_root: Path,
    cand_root: Path,
) -> DirectoryDifferences:
    """Classify files and directories as added, deleted, changed or unchanged.

    Args:
        ref_files: Reference files keyed by relative path (only keys are used).
        ref_dirs: Relative directory paths of the reference tree.
        cand_files: Candidate files keyed by relative path (only keys are used).
        cand_dirs: Relative directory paths of the candidate tree.
        ref_root: Root that reference relative paths are checksummed under.
        cand_root: Root that candidate relative paths are checksummed under.

    Returns:
        A ``DirectoryDifferences`` whose lists are sorted. A file present
        on both sides is changed when its two checksums differ. A
        directory present on both sides is changed when it is an ancestor
        of any added, deleted, or changed file.
    """
    reference_names = set(ref_files)
    candidate_names = set(cand_files)
    reference_dirs = set(ref_dirs)
    candidate_dirs = set(cand_dirs)

    shared_names = sorted(reference_names & candidate_names)
    only_candidate = sorted(candidate_names - reference_names)
    only_reference = sorted(reference_names - candidate_names)

    modified: List[str] = []
    identical: List[str] = []
    if shared_names:
        # Only files that exist on both sides need to be checksummed.
        before = compute_checksums(ref_root, shared_names)
        after = compute_checksums(cand_root, shared_names)
        for name in shared_names:
            bucket = identical if before.get(name) == after.get(name) else modified
            bucket.append(name)

    # Every ancestor directory of a touched file is a candidate for
    # "changed"; only those existing on both sides are kept.
    touched_parents: Set[str] = set()
    for name in modified + only_candidate + only_reference:
        touched_parents.update(parent for parent in parent_paths(name) if parent)
    shared_dirs = reference_dirs & candidate_dirs

    return DirectoryDifferences(
        added_files=only_candidate,
        deleted_files=only_reference,
        changed_files=modified,
        added_dirs=sorted(candidate_dirs - reference_dirs),
        deleted_dirs=sorted(reference_dirs - candidate_dirs),
        changed_dirs=sorted(touched_parents & shared_dirs),
        unchanged_files=identical,
    )


def parent_paths(rel_path: str) -> List[str]:
    """List the ancestor paths of *rel_path*, outermost first.

    For ``"a/b/c"`` this yields ``["a", "a/b"]``; a path with a single
    component (or none) has no ancestors and yields an empty list.
    """
    segments = Path(rel_path).parts
    # Each proper prefix of the component tuple is one ancestor.
    return [
        Path(*segments[:depth]).as_posix()
        for depth in range(1, len(segments))
    ]


class Palette:
    """ANSI color sequences that collapse to empty strings when disabled."""

    def __init__(self, enabled: bool) -> None:
        """Populate the color attributes.

        Args:
            enabled: When true, attributes hold ANSI escape sequences;
                otherwise all of them are empty strings.
        """
        if enabled:
            self.green = "\033[32m"
            self.yellow = "\033[33m"
            self.red = "\033[31m"
            self.reset = "\033[0m"
        else:
            self.green = self.yellow = self.red = self.reset = ""

    def wrap(self, text: str, color: str) -> str:
        """Return *text* surrounded by *color* and the reset sequence.

        An empty *color* returns *text* unchanged.
        """
        if not color:
            return text
        return f"{color}{text}{self.reset}"


def supports_color() -> bool:
    """Decide whether colored output should be used.

    Color is used only when standard output is a terminal and the
    ``NO_COLOR`` environment variable is not set (to any value).
    """
    if not sys.stdout.isatty():
        return False
    return "NO_COLOR" not in os.environ


def report(
    palette: Palette,
    summary: DirectoryDifferences,
    show_unchanged: bool,
) -> None:
    """Print the comparison result to standard output, one entry per line.

    Args:
        palette: Supplies the colors and the ``wrap`` helper for colored lines.
        summary: The differences to print.
        show_unchanged: When true, unchanged files are listed as well
            (without color).

    Output order is: added, deleted, changed (directories before files
    within each group), then unchanged files. When there are no
    differences a "No differences detected." line is printed first.
    """
    list_unchanged = bool(show_unchanged and summary.unchanged_files)

    if not summary.has_differences:
        print("No differences detected.")
        if not list_unchanged:
            return

    lines: List[str] = []
    lines += [palette.wrap(f"ADDED dir: {rel}/", palette.green) for rel in summary.added_dirs]
    lines += [palette.wrap(f"ADDED file: {rel}", palette.green) for rel in summary.added_files]
    lines += [palette.wrap(f"DELETED dir: {rel}/", palette.red) for rel in summary.deleted_dirs]
    lines += [palette.wrap(f"DELETED file: {rel}", palette.red) for rel in summary.deleted_files]
    lines += [palette.wrap(f"CHANGED dir: {rel}/", palette.yellow) for rel in summary.changed_dirs]
    lines += [palette.wrap(f"CHANGED file: {rel}", palette.yellow) for rel in summary.changed_files]
    if list_unchanged:
        lines += [f"UNCHANGED file: {rel}" for rel in summary.unchanged_files]

    for line in lines:
        print(line)


def main() -> int:
    """Run the comparison described by the command line and report it.

    Returns:
        The process exit status, following the convention of ``diff``:
        ``0`` when the directories are identical, ``1`` when differences
        were found, and ``2`` when the comparison could not be performed
        (invalid directory argument or checksum failure).
    """
    args = parse_args()
    ref_root = args.reference.resolve()
    cand_root = args.candidate.resolve()

    try:
        ensure_directory(ref_root, "Reference")
        ensure_directory(cand_root, "Candidate")
    except (FileNotFoundError, NotADirectoryError) as exc:
        print(exc, file=sys.stderr)
        return 2

    ref_files, ref_dirs = relative_entries(ref_root)
    cand_files, cand_dirs = relative_entries(cand_root)

    try:
        summary = detect_differences(ref_files, ref_dirs, cand_files, cand_dirs, ref_root, cand_root)
    except FileNotFoundError:
        print("sha256sum executable not found in PATH", file=sys.stderr)
        return 2
    except subprocess.CalledProcessError as exc:
        target = exc.cmd[-1] if exc.cmd else "unknown"
        detail = exc.stderr.strip() if isinstance(exc.stderr, str) else ""
        print(f"sha256sum failed for '{target}': {detail}", file=sys.stderr)
        return 2

    palette = Palette(supports_color())
    report(palette, summary, args.verbose)
    # Use 1, not -1: a negative value passed to sys.exit() is reported as
    # status 255 on POSIX, which ranks above the error status 2 and is
    # conventionally read as a fatal failure.
    return 1 if summary.has_differences else 0


# Script entry point: exit the interpreter with the status main() returns.
if __name__ == "__main__":
    raise SystemExit(main())

Check out similar posts by category: Python