Python script to compare directories, checking for added/deleted/changed files
The following Python script compares two directories and reports added, deleted, or changed files and directories (printed in color on consoles that support ANSI escape codes).
compare_directories.py
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2025 Uli Köhler <[email protected]>
# SPDX-License-Identifier: CC0-1.0
#
# Maintainers: KiCad Library Team
"""Compare two directories and report added, deleted, or changed entries."""
from __future__ import annotations
import argparse
import concurrent.futures
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Sequence, Set, Tuple
@dataclass
class DirectoryDifferences:
    """Result of comparing a reference directory tree with a candidate tree.

    Every entry is a path relative to the compared roots. ``unchanged_files``
    is carried along for verbose reporting and never counts as a difference.
    """

    # Files present only in the candidate / only in the reference / in both
    # but with differing content.
    added_files: List[str]
    deleted_files: List[str]
    changed_files: List[str]
    # Directories present only in the candidate / only in the reference /
    # in both but containing a difference.
    added_dirs: List[str]
    deleted_dirs: List[str]
    changed_dirs: List[str]
    # Files present in both trees with equal content.
    unchanged_files: List[str]

    @property
    def has_differences(self) -> bool:
        """Return True if any added/deleted/changed list is non-empty."""
        difference_buckets = (
            self.added_files,
            self.deleted_files,
            self.changed_files,
            self.added_dirs,
            self.deleted_dirs,
            self.changed_dirs,
        )
        for bucket in difference_buckets:
            if bucket:
                return True
        return False
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the comparison script.

    Args:
        argv: Argument list to parse, without the program name. When left at
            ``None`` argparse falls back to ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument call.

    Returns:
        A namespace with ``reference`` and ``candidate`` (both ``Path``) and
        the boolean ``verbose`` flag.
    """
    parser = argparse.ArgumentParser(
        description="Compare two directories and report added, deleted, or changed entries."
    )
    parser.add_argument("reference", type=Path, help="Original directory to compare from")
    parser.add_argument("candidate", type=Path, help="Directory to compare against the reference")
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Also list files that are identical in both directories",
    )
    return parser.parse_args(argv)
def ensure_directory(path: Path, role: str) -> None:
    """Validate that *path* refers to an existing directory.

    Args:
        path: Filesystem location to check.
        role: Label used at the start of the error message.

    Raises:
        FileNotFoundError: If *path* does not exist.
        NotADirectoryError: If *path* exists but is not a directory.
    """
    if path.is_dir():
        return
    if path.exists():
        raise NotADirectoryError(f"{role} path '{path}' is not a directory")
    raise FileNotFoundError(f"{role} directory '{path}' does not exist")
def relative_entries(root: Path) -> Tuple[Dict[str, Path], Set[str]]:
    """Walk *root* and index everything below it by relative POSIX path.

    Args:
        root: Directory to walk with ``os.walk``.

    Returns:
        A tuple ``(files, directories)``: ``files`` maps each file's relative
        path to the path built from the walked directory and the file name;
        ``directories`` is the set of relative directory paths. The root
        itself is not included.
    """
    file_index: Dict[str, Path] = {}
    dir_index: Set[str] = set()

    for here, subdir_names, file_names in os.walk(root):
        base = Path(here)

        # Register the directory being visited, except for the root itself.
        rel_base = base.relative_to(root)
        if rel_base != Path('.'):
            dir_index.add(rel_base.as_posix())

        # Register every child directory reported for this level.
        dir_index.update(
            (base / name).relative_to(root).as_posix() for name in subdir_names
        )

        for name in file_names:
            location = base / name
            file_index[location.relative_to(root).as_posix()] = location

    return file_index, dir_index
def sha256sum(path: Path) -> str:
    """Return the SHA-256 hex digest of *path* using the ``sha256sum`` tool.

    Args:
        path: File to hash.

    Returns:
        The hexadecimal digest printed by ``sha256sum``.

    Raises:
        FileNotFoundError: If the ``sha256sum`` executable cannot be started.
        subprocess.CalledProcessError: If ``sha256sum`` exits with a non-zero
            status; its ``stderr`` attribute holds the captured error text.
    """
    completed = subprocess.run(
        # "--" ends option parsing so a path starting with "-" is still
        # treated as a file operand.
        ["sha256sum", "--", str(path)],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # The output echoes the file name; replace undecodable bytes rather
        # than failing, since only the leading hex digest is used.
        errors="replace",
    )
    # GNU sha256sum starts the line with a backslash when the file name
    # contains a backslash or newline. Strip it so the same content always
    # yields the same digest string regardless of the path it lives under.
    return completed.stdout.split()[0].lstrip("\\")
def compute_checksums(root: Path, rel_paths: Sequence[str]) -> Dict[str, str]:
    """Hash the given files below *root* using a thread pool.

    Args:
        root: Directory that the relative paths are joined onto.
        rel_paths: Relative paths of the files to hash.

    Returns:
        A dict mapping each relative path to the value ``sha256sum`` returned
        for ``root / rel_path``. Any exception raised while hashing a file
        propagates to the caller.
    """
    pool_size = min(16, os.cpu_count() or 1)

    def digest_of(rel_path: str) -> Tuple[str, str]:
        return rel_path, sha256sum(root / rel_path)

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = [pool.submit(digest_of, rel_path) for rel_path in rel_paths]
        # Collect results as they finish; each result carries its own key.
        return dict(
            finished.result()
            for finished in concurrent.futures.as_completed(pending)
        )
def detect_differences(
    ref_files: Dict[str, Path],
    ref_dirs: Set[str],
    cand_files: Dict[str, Path],
    cand_dirs: Set[str],
    ref_root: Path,
    cand_root: Path,
) -> DirectoryDifferences:
    """Classify the entries of two directory trees.

    Args:
        ref_files: Files of the reference tree keyed by relative path (only
            the keys are used here).
        ref_dirs: Relative directory paths of the reference tree.
        cand_files: Files of the candidate tree keyed by relative path (only
            the keys are used here).
        cand_dirs: Relative directory paths of the candidate tree.
        ref_root: Root that reference relative paths are hashed under.
        cand_root: Root that candidate relative paths are hashed under.

    Returns:
        A ``DirectoryDifferences`` with sorted lists of added, deleted,
        changed and unchanged entries.

    Raises:
        Whatever ``compute_checksums`` raises while hashing files that exist
        in both trees.
    """
    ref_file_set = set(ref_files)
    cand_file_set = set(cand_files)
    ref_dir_set = set(ref_dirs)
    cand_dir_set = set(cand_dirs)

    added_files = sorted(cand_file_set - ref_file_set)
    deleted_files = sorted(ref_file_set - cand_file_set)
    common_files = sorted(ref_file_set & cand_file_set)
    added_dirs = sorted(cand_dir_set - ref_dir_set)
    deleted_dirs = sorted(ref_dir_set - cand_dir_set)

    changed_files: List[str] = []
    unchanged_files: List[str] = []
    if common_files:
        # Only files present on both sides need hashing.
        ref_checksums = compute_checksums(ref_root, common_files)
        cand_checksums = compute_checksums(cand_root, common_files)
        for rel in common_files:
            # Index directly: a missing digest must not compare equal
            # (None == None) and be reported as unchanged.
            if ref_checksums[rel] != cand_checksums[rel]:
                changed_files.append(rel)
            else:
                unchanged_files.append(rel)

    # A directory present in both trees counts as changed when anything
    # below it differs. Added/deleted directories are included so that an
    # added or removed *empty* subdirectory also marks its parents.
    touched = changed_files + added_files + deleted_files + added_dirs + deleted_dirs
    touched_parents = {
        rel_parent
        for rel in touched
        for rel_parent in parent_paths(rel)
    }
    changed_dirs = sorted(touched_parents & ref_dir_set & cand_dir_set)

    return DirectoryDifferences(
        added_files=added_files,
        deleted_files=deleted_files,
        changed_files=changed_files,
        added_dirs=added_dirs,
        deleted_dirs=deleted_dirs,
        changed_dirs=changed_dirs,
        unchanged_files=unchanged_files,
    )
def parent_paths(rel_path: str) -> List[str]:
    """List the ancestor paths of *rel_path*, shortest first.

    For ``"a/b/c.txt"`` this yields ``["a", "a/b"]``; a path with a single
    component has no ancestors and yields an empty list.
    """
    segments = Path(rel_path).parts
    return [
        Path(*segments[:depth]).as_posix()
        for depth in range(1, len(segments))
    ]
class Palette:
    """ANSI colour codes that collapse to empty strings when disabled."""

    def __init__(self, enabled: bool) -> None:
        """Store the escape sequences, or empty strings if not *enabled*."""

        def code(sequence: str) -> str:
            return sequence if enabled else ""

        self.green = code("\033[32m")
        self.yellow = code("\033[33m")
        self.red = code("\033[31m")
        self.reset = code("\033[0m")

    def wrap(self, text: str, color: str) -> str:
        """Return *text* surrounded by *color* and the reset code.

        An empty *color* returns *text* unchanged.
        """
        if not color:
            return text
        return f"{color}{text}{self.reset}"
def supports_color() -> bool:
    """Return True if coloured output should be written to stdout.

    Colour is used only when stdout is a terminal and the ``NO_COLOR``
    environment variable is unset or empty. Per the NO_COLOR convention
    only a present, non-empty value disables colour.
    """
    return sys.stdout.isatty() and not os.environ.get("NO_COLOR")
def report(
    palette: Palette,
    summary: DirectoryDifferences,
    show_unchanged: bool,
) -> None:
    """Print the comparison result to stdout, one entry per line.

    Args:
        palette: Provides the colour codes and the ``wrap`` helper.
        summary: Comparison result to print.
        show_unchanged: When true, identical files are listed as well.
    """
    list_unchanged = show_unchanged and bool(summary.unchanged_files)

    if not summary.has_differences:
        print("No differences detected.")
        if not list_unchanged:
            return

    # (label, entries, suffix, colour) in output order; directories carry a
    # trailing slash.
    sections = (
        ("ADDED dir", summary.added_dirs, "/", palette.green),
        ("ADDED file", summary.added_files, "", palette.green),
        ("DELETED dir", summary.deleted_dirs, "/", palette.red),
        ("DELETED file", summary.deleted_files, "", palette.red),
        ("CHANGED dir", summary.changed_dirs, "/", palette.yellow),
        ("CHANGED file", summary.changed_files, "", palette.yellow),
    )
    for label, entries, suffix, color in sections:
        for rel in entries:
            print(palette.wrap(f"{label}: {rel}{suffix}", color))

    if list_unchanged:
        # Unchanged entries are printed without colour.
        for rel in summary.unchanged_files:
            print(f"UNCHANGED file: {rel}")
def main() -> int:
    """Run the comparison and return the process exit status.

    Returns:
        ``0`` when no differences were found, ``-1`` when differences were
        found, and ``2`` when an input directory is invalid or hashing fails.
    """
    options = parse_args()
    ref_root, cand_root = options.reference.resolve(), options.candidate.resolve()

    # Validate both inputs before walking anything.
    try:
        for root, role in ((ref_root, "Reference"), (cand_root, "Candidate")):
            ensure_directory(root, role)
    except (FileNotFoundError, NotADirectoryError) as problem:
        print(problem, file=sys.stderr)
        return 2

    ref_files, ref_dirs = relative_entries(ref_root)
    cand_files, cand_dirs = relative_entries(cand_root)

    try:
        summary = detect_differences(
            ref_files, ref_dirs, cand_files, cand_dirs, ref_root, cand_root
        )
    except FileNotFoundError:
        # Reported as a missing hashing tool.
        print("sha256sum executable not found in PATH", file=sys.stderr)
        return 2
    except subprocess.CalledProcessError as failure:
        # The last command element is reported as the file that failed.
        target = failure.cmd[-1] if failure.cmd else "unknown"
        detail = failure.stderr.strip() if isinstance(failure.stderr, str) else ""
        print(f"sha256sum failed for '{target}': {detail}", file=sys.stderr)
        return 2

    report(Palette(supports_color()), summary, options.verbose)

    if summary.has_differences:
        return -1
    return 0
if __name__ == "__main__":
    sys.exit(main())

Check out similar posts by category:
Python
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow