Python script to find & count word tokens in filenames recursively
This script shows which words occur in the filename components of a given directory tree, and how many times each word appears. It uses the os, re, collections, and argparse standard-library modules to traverse the directory tree, tokenize the filenames, and count the words.
#!/usr/bin/env python3
import os
import re
from collections import Counter
import argparse
def list_files_recursively(directory):
    """Yield the full path of every file beneath *directory* (recursive walk).

    Directories themselves are not yielded, only regular file entries,
    joined with the directory they were found in.
    """
    return (
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(directory)
        for name in filenames
    )
def process_filename(filename, lower_tokens=False, ignore_numeric=False):
    """Return the word tokens of *filename*'s base name, extension removed.

    The base name is split on runs of whitespace, dashes, and underscores,
    and empty pieces are discarded.  With ``lower_tokens`` the tokens are
    lowercased; with ``ignore_numeric`` tokens consisting solely of numeric
    characters (per ``str.isnumeric``) are dropped.
    """
    # Only the last extension is stripped ("a.tar.gz" keeps ".tar").
    stem, _ = os.path.splitext(os.path.basename(filename))
    # Build a lazy pipeline; it is materialized once at the return.
    pieces = (piece for piece in re.split(r'[\s\-_]+', stem) if piece)
    if lower_tokens:
        pieces = (piece.lower() for piece in pieces)
    if ignore_numeric:
        pieces = (piece for piece in pieces if not piece.isnumeric())
    return list(pieces)
def build_token_counter(directory, lower_tokens=False, ignore_numeric=False):
    """Aggregate token frequencies over every filename below *directory*.

    Walks the tree via ``list_files_recursively`` and tokenizes each path
    with ``process_filename``, forwarding the two option flags unchanged.
    Returns a ``collections.Counter`` mapping token -> occurrence count.
    """
    counts = Counter()
    for path in list_files_recursively(directory):
        for token in process_filename(path, lower_tokens, ignore_numeric):
            counts[token] += 1
    return counts
def main():
    """Parse CLI arguments, count filename tokens, and print a sorted report.

    Flags:
        -s  case-sensitive tokenization (default lowercases all tokens)
        -N  drop tokens that are entirely numeric
    """
    parser = argparse.ArgumentParser(
        description="Tokenize filenames and count token occurrences.")
    parser.add_argument("directory", help="The directory to scan.")
    parser.add_argument("-s", action="store_true",
                        help="Case-sensitive tokenization (default is case-insensitive).")
    parser.add_argument("-N", action="store_true",
                        help="Ignore all-numeric tokens.")
    args = parser.parse_args()

    # Fail loudly on a bad path: os.walk silently yields nothing for a
    # nonexistent directory, which would otherwise print an empty report
    # and exit 0, hiding a typo'd path from the user.
    if not os.path.isdir(args.directory):
        parser.error(f"not a directory: {args.directory}")

    # -s means "keep case", so lowercasing is the inverse of the flag.
    token_counter = build_token_counter(
        args.directory, lower_tokens=not args.s, ignore_numeric=args.N)

    # Sort least-common first so the most frequent tokens land at the
    # bottom, nearest the prompt; ties break alphabetically by token.
    print("\nToken counts (most common at the bottom):")
    for token, count in sorted(token_counter.items(),
                               key=lambda item: (item[1], item[0])):
        print(f"{token}: {count}")


if __name__ == "__main__":
    main()
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow