递归查找和计算文件名中单词标记的 Python 脚本
此脚本可用于统计给定目录树中所有文件名里出现了哪些单词（标记），并显示每个单词出现的次数。脚本使用 os 模块遍历目录树，并使用 collections 模块中的 Counter 对单词进行计数。
filename_token_counter.py
#!/usr/bin/env python3
import os
import re
from collections import Counter
import argparse
def list_files_recursively(directory):
    """Yield the path of every file found anywhere beneath *directory*.

    Each yielded path is the walked directory joined with the file name,
    so it carries the same prefix that was passed in as *directory*.
    """
    for dirpath, _subdirs, filenames in os.walk(directory):
        yield from (os.path.join(dirpath, name) for name in filenames)
def process_filename(filename, lower_tokens=False, ignore_numeric=False):
    """Split a file's base name (minus its last extension) into tokens.

    The directory part and the final extension are dropped, then the
    remaining stem is split on runs of whitespace, dashes and underscores.

    Args:
        filename: Path or bare file name to tokenize.
        lower_tokens: When true, every token is lowercased.
        ignore_numeric: When true, tokens for which ``str.isnumeric()``
            holds are left out.

    Returns:
        A list of non-empty tokens in their order of appearance.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    result = []
    for piece in re.split(r'[\s\-_]+', stem):
        # Leading/trailing separators produce empty strings; skip them.
        if not piece:
            continue
        if lower_tokens:
            piece = piece.lower()
        if ignore_numeric and piece.isnumeric():
            continue
        result.append(piece)
    return result
def build_token_counter(directory, lower_tokens=False, ignore_numeric=False):
    """Count filename tokens across every file beneath *directory*.

    Args:
        directory: Root of the directory tree to scan.
        lower_tokens: Forwarded to ``process_filename``.
        ignore_numeric: Forwarded to ``process_filename``.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    return Counter(
        token
        for path in list_files_recursively(directory)
        for token in process_filename(path, lower_tokens, ignore_numeric)
    )
def main():
    """Parse the command line, count filename tokens, and print the tally.

    Command-line arguments:
        directory: The directory tree to scan.
        -s: Keep the original case of tokens (default: lowercase them).
        -N: Leave out all-numeric tokens.

    Tokens are printed one per line as ``token: count``, sorted by count
    and then by token, so the most common token is printed last.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) when *directory*
            is not an existing directory, or on invalid arguments.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description="Tokenize filenames and count token occurrences.")
    parser.add_argument("directory", help="The directory to scan.")
    parser.add_argument("-s", action="store_true", help="Case-sensitive tokenization (default is case-insensitive).")
    parser.add_argument("-N", action="store_true", help="Ignore all-numeric tokens.")
    args = parser.parse_args()

    # os.walk() ignores listing errors by default, so a mistyped path would
    # otherwise produce an empty report and a success exit status.
    if not os.path.isdir(args.directory):
        parser.error(f"not a directory: {args.directory}")

    # Build token counter
    token_counter = build_token_counter(args.directory, lower_tokens=not args.s, ignore_numeric=args.N)

    # Print tokens sorted with the most common at the bottom
    print("\nToken counts (most common at the bottom):")
    for token, count in sorted(token_counter.items(), key=lambda item: (item[1], item[0])):
        print(f"{token}: {count}")
if __name__ == "__main__":
    main()
Check out similar posts by category:
Python
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow