From 91e5b6178e8e59ddb6d3f4233750573c9645a7f3 Mon Sep 17 00:00:00 2001 From: ppkantorski <6467366+ppkantorski@users.noreply.github.com> Date: Sat, 31 May 2025 07:17:11 -0700 Subject: [PATCH] Update format_repo2.py --- scripts/StevensND/format_repo2.py | 152 +++++++++++------------------- 1 file changed, 56 insertions(+), 96 deletions(-) diff --git a/scripts/StevensND/format_repo2.py b/scripts/StevensND/format_repo2.py index b8fb09b..2da2627 100644 --- a/scripts/StevensND/format_repo2.py +++ b/scripts/StevensND/format_repo2.py @@ -5,13 +5,9 @@ import re import sys import unicodedata -# ----- Normalization & Title‐Casing Helpers (same as other repos) ----- +# ----- Normalization & Title‐Casing Helpers ----- def sanitize_name(name): - """ - Remove accents and unwanted characters, replace ' - ' with a single space, - and collapse multiple spaces. - """ normalized = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii') cleaned = normalized.replace("'", "").replace("’", "").replace("`", "").replace('"', "") cleaned = cleaned.replace(" - ", " ") @@ -19,9 +15,6 @@ def sanitize_name(name): return cleaned.strip() def capitalize_hyphenated(word): - """ - Capitalize both parts of a hyphenated word. E.g. "yooka-laylee" → "Yooka-Laylee". - """ parts = word.split('-') capitalized_parts = [] for part in parts: @@ -31,44 +24,27 @@ def capitalize_hyphenated(word): capitalized_parts.append('') return '-'.join(capitalized_parts) -# Regex for Roman numerals up to 3999 ROMAN_NUMERAL_PATTERN = re.compile( r"^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", re.IGNORECASE ) -# Known uppercase acronyms we want to preserve exactly ACRONYMS = { "HD", "2D", "3D", "4K", "VR", "AI", "API", "USB", "CPU", "GPU", "DVD", "CD", - "RPG", "FPS", "MMO", "MMORPG", "LAN", "GUI", "NPC", "FFVII", "FFVIII", "FX", "FFIX", "FFX", "FFXII", "2K", "V1", "V2", "V3", "V4" + "RPG", "FPS", "MMO", "MMORPG", "LAN", "GUI", "NPC", + "FFVII", "FFVIII", "FFIX", "FFX", "FFXII", + "FX", "2K", "V1", "V2", "V3", "V4" } def is_roman_numeral(word): - """ - Return True if `word` is a valid Roman numeral (case‐insensitive). - """ return bool(ROMAN_NUMERAL_PATTERN.match(word)) def title_case_preserve_numbers(name): - """ - Title-case with these rules: - • Fully uppercase acronyms remain unchanged (e.g. HD, 2D, 3D, FFVII, etc.). - • Roman numerals become fully uppercase (e.g. 'iii' → 'III', 'xI' → 'XI'). - • Hyphenated words are capitalized on both sides (→ 'Yooka-Laylee'). - • Small filler words (a, an, and, the, of, in, etc.) become lowercase - only if they appear in the middle and are not immediately after a subtitle marker, - except the first and last words (always capitalized). - • After a subtitle marker (":", "~", "–", "—", or "-"), force capitalization - on all subsequent words until the next subtitle marker or end. - • Compound Roman numerals joined by &, +, or | become fully uppercase - (e.g. "I&ii" → "I&II"). - """ lowercase_exceptions = { "a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor", "of", "on", "or", "so", "the", "to", "with", "yet" } subtitle_markers = {":", "~", "-", "–", "—"} - words = name.split() result = [] force_capitalize_mode = False @@ -80,7 +56,6 @@ def title_case_preserve_numbers(name): for part in split_parts: if part in subtitle_markers: - # Keep marker, force next segments to capitalize rebuilt_parts.append(part) force_capitalize_mode = True continue @@ -94,7 +69,6 @@ def title_case_preserve_numbers(name): return subword.upper() if is_roman_numeral(subword): return subword.upper() - # Check for compound Roman numerals joined by &, +, | for sep in ('&', '+', '|'): if sep in subword: pieces = subword.split(sep) @@ -104,7 +78,7 @@ def title_case_preserve_numbers(name): if force_capitalize_mode or is_first or is_last or (lower_part not in lowercase_exceptions): sub_parts = part.split('-') - rebuilt_parts.append('-'.join(capitalize_special(p) for p in sub_parts)) + rebuilt_parts.append('-'.join(capitalize_special(s) for s in sub_parts)) else: rebuilt_parts.append(lower_part) @@ -112,58 +86,41 @@ def title_case_preserve_numbers(name): if not contains_marker: force_capitalize_mode = False - # Always capitalize first and last words fully: if result: - # First word first_split = result[0].split('-') - new_first = [] - for p in first_split: - if p.upper() in ACRONYMS or is_roman_numeral(p): - new_first.append(p.upper()) - else: - new_first.append(capitalize_hyphenated(p)) - result[0] = '-'.join(new_first) + result[0] = "-".join(capitalize_hyphenated(p) if not is_roman_numeral(p) else p.upper() for p in first_split) - # Last word last_split = result[-1].split('-') - new_last = [] - for p in last_split: - if p.upper() in ACRONYMS or is_roman_numeral(p): - new_last.append(p.upper()) - else: - new_last.append(capitalize_hyphenated(p)) - result[-1] = '-'.join(new_last) + result[-1] = "-".join(capitalize_hyphenated(p) if not is_roman_numeral(p) else p.upper() for p in last_split) - return ' '.join(result) + return " ".join(result) def clean_title(name): - """ - Run sanitize_name() → title_case_preserve_numbers() in one shot. - """ return title_case_preserve_numbers(sanitize_name(name)) -# ----- Revised get_game_name_and_mod_name & Structure Logic ----- +# ----- Game & Mod Name Logic ----- + +def strip_versions(text): + """ + Remove any substrings that look like version numbers, e.g.: + - 1.0, 1.2.3 + - v1.0, v2.3.4 + """ + return re.sub(r'\b(v?\d+(?:\.\d+){1,2})\b', '', text, flags=re.IGNORECASE).strip() + def get_game_name_and_mod_name(path, root_dir): - """ - Given a folder path where a .pchtxt lives, derive: - • game_name (honoring "[Country]" folders, ", The", etc.), then clean_title(). - • mod_name (join of all sub‐folders except game, handling Aspect Ratio, - version suffix " vX", etc.), then clean_title(). - """ relative_path = os.path.relpath(path, root_dir) parts = relative_path.split(os.sep) - # 1) Derive raw game folder (strip [tags], handle ", The") raw_game = parts[0] raw_game = re.sub(r'\[.*?\]', '', raw_game).strip() - if ', The' in raw_game: - p = raw_game.split(', The') + if ", The" in raw_game: + p = raw_game.split(", The") raw_game = f"The {p[0]}{p[1]}" raw_game = raw_game.replace(" - ", " ") - # 2) Check for country folder like "[USA]" etc. country = None for p in parts[1:]: if re.search(r'\[.*?\]', p): @@ -171,64 +128,66 @@ def get_game_name_and_mod_name(path, root_dir): break if country: raw_game = f"{raw_game} ({country})" - - # 3) Clean/title-case the game name: game_name = clean_title(raw_game) - # 4) Derive raw mod_name - # a) If path contains "Aspect Ratio" segment, use that - if 'Aspect Ratio' in relative_path: + sub_folders = [ re.sub(r'\[.*?\]', '', p).strip() for p in parts[1:] ] + sub_folders = [sf for sf in sub_folders if sf.lower() != "pchtxt"] + + if "Aspect Ratio" in relative_path: aspect_folder = os.path.basename(path) raw_mod = f"Aspect Ratio {aspect_folder}" else: - # b) Otherwise join all folder parts after the first as mod components - mod_parts = [re.sub(r'\[.*?\]', '', p).strip() for p in parts[1:]] - if mod_parts: - raw_mod = " ".join(mod_parts) + if sub_folders: + m = re.match(r'^([0-9]+(?:\.[0-9]+)*)\s*(.*)$', sub_folders[0]) + if m: + trailing = m.group(2).strip() + if trailing: + sub_folders[0] = trailing + else: + sub_folders = sub_folders[1:] + + if country and sub_folders: + prefix = country.lower() + candidate = sub_folders[0].lower() + if candidate.startswith(prefix): + sub_folders[0] = sub_folders[0][len(country):].lstrip() + + if sub_folders: + raw_mod = " ".join(sub_folders).strip() else: raw_mod = "" - # c) If raw_mod ends in " v" → preserve version suffix as part of mod_name - m = re.search(r'(.*) v[0-9.]+$', raw_mod) - if m: - raw_mod = m.group(1) + raw_mod = strip_versions(raw_mod) + m2 = re.match(r'^(.*)\s+v[0-9.]+$', raw_mod, re.IGNORECASE) + if m2: + raw_mod = m2.group(1).strip() - raw_mod = raw_mod.strip() - - # 5) Clean/title-case the mod_name: mod_name = clean_title(raw_mod) if raw_mod else "" - return game_name, mod_name +# ----- File Structure Logic ----- + def create_formatted_structure(folder_path): - """ - Walk `folder_path` for all `.pchtxt` files. For each: - 1) Derive (game_name, mod_name) via get_game_name_and_mod_name. - 2) Place the file under formatted/" - "/".pchtxt". - """ - formatted_path = os.path.join(folder_path, 'formatted') + formatted_path = os.path.join(folder_path, "formatted") os.makedirs(formatted_path, exist_ok=True) print(f"Creating formatted structure at: {formatted_path}\n") for root, dirs, files in os.walk(folder_path): - # Skip anything already under "formatted" - if 'formatted' in root.split(os.sep): + if "formatted" in root.split(os.sep): continue - for file in files: - if not file.lower().endswith('.pchtxt'): + for filename in files: + if not filename.lower().endswith(".pchtxt"): continue game_name, mod_name = get_game_name_and_mod_name(root, folder_path) - version = file[:-len('.pchtxt')].strip() - - # If mod_name ended up empty, we still create "Game Name - " - combined_dir_name = f"{game_name} - {mod_name}".rstrip() - new_dir = os.path.join(formatted_path, combined_dir_name) + version = filename[:-len(".pchtxt")].strip() + combined_dir = f"{game_name} - {mod_name}".rstrip() + new_dir = os.path.join(formatted_path, combined_dir) os.makedirs(new_dir, exist_ok=True) - src = os.path.join(root, file) + src = os.path.join(root, filename) dst = os.path.join(new_dir, f"{version}.pchtxt") shutil.copy(src, dst) print(f"Copied {src} → {dst}") @@ -240,6 +199,7 @@ def main(folder_path): create_formatted_structure(folder_path) print("All files have been organized successfully.") + if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python format_repo_2.py /path/to/folder/")