Python script to find & count word tokens in filenames recursively
This script shows which words occur in the filename components of a given directory tree, and how many times each word appears. It uses the os, re, collections, and argparse standard-library modules to traverse the directory tree, tokenize the filenames, and count the words.
#!/usr/bin/env python3
import os
import re
from collections import Counter
import argparse
def list_files_recursively(directory):
    """Yield the full path of every file beneath *directory* (recursive walk).

    Directories themselves are not yielded, only regular file entries,
    joined with the directory they were found in.
    """
    return (
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(directory)
        for name in filenames
    )
def process_filename(filename, lower_tokens=False, ignore_numeric=False):
    """Return the word tokens of *filename*'s base name, extension removed.

    The base name is split on runs of whitespace, dashes, and underscores,
    and empty pieces are discarded.  With ``lower_tokens`` the tokens are
    lowercased; with ``ignore_numeric`` tokens consisting solely of numeric
    characters (per ``str.isnumeric``) are dropped.
    """
    # Only the last extension is stripped ("a.tar.gz" keeps ".tar").
    stem, _ = os.path.splitext(os.path.basename(filename))
    # Build a lazy pipeline; it is materialized once at the return.
    pieces = (piece for piece in re.split(r'[\s\-_]+', stem) if piece)
    if lower_tokens:
        pieces = (piece.lower() for piece in pieces)
    if ignore_numeric:
        pieces = (piece for piece in pieces if not piece.isnumeric())
    return list(pieces)
def build_token_counter(directory, lower_tokens=False, ignore_numeric=False):
    """Aggregate token frequencies over every filename below *directory*.

    Walks the tree via ``list_files_recursively`` and tokenizes each path
    with ``process_filename``, forwarding the two option flags unchanged.
    Returns a ``collections.Counter`` mapping token -> occurrence count.
    """
    counts = Counter()
    for path in list_files_recursively(directory):
        for token in process_filename(path, lower_tokens, ignore_numeric):
            counts[token] += 1
    return counts
def main():
    """Parse CLI arguments, count filename tokens, and print a sorted report.

    Flags:
        -s  case-sensitive tokenization (default lowercases all tokens)
        -N  drop tokens that are entirely numeric
    """
    parser = argparse.ArgumentParser(
        description="Tokenize filenames and count token occurrences.")
    parser.add_argument("directory", help="The directory to scan.")
    parser.add_argument("-s", action="store_true",
                        help="Case-sensitive tokenization (default is case-insensitive).")
    parser.add_argument("-N", action="store_true",
                        help="Ignore all-numeric tokens.")
    args = parser.parse_args()

    # Fail loudly on a bad path: os.walk silently yields nothing for a
    # nonexistent directory, which would otherwise print an empty report
    # and exit 0, hiding a typo'd path from the user.
    if not os.path.isdir(args.directory):
        parser.error(f"not a directory: {args.directory}")

    # -s means "keep case", so lowercasing is the inverse of the flag.
    token_counter = build_token_counter(
        args.directory, lower_tokens=not args.s, ignore_numeric=args.N)

    # Sort least-common first so the most frequent tokens land at the
    # bottom, nearest the prompt; ties break alphabetically by token.
    print("\nToken counts (most common at the bottom):")
    for token, count in sorted(token_counter.items(),
                               key=lambda item: (item[1], item[0])):
        print(f"{token}: {count}")


if __name__ == "__main__":
    main()
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow