#!/usr/bin/env python3
"""Collect ``.pchtxt`` files into a normalized ``formatted/`` folder tree.

Each ``.pchtxt`` file found under the root folder is copied to
``formatted/<Game Name> - Graphics Mods/<version>.pchtxt``, where the game
name is derived from the file's parent folder name and title-cased.
"""
import os
import shutil
import unicodedata
import sys
import re


def sanitize_name(name):
    """
    Return *name* reduced to plain ASCII with quotes removed.

    Accented characters are decomposed (NFKD) and anything that is still
    non-ASCII is dropped. Straight quotes, backticks and double quotes are
    removed, ' - ' is replaced by a single space, and runs of whitespace are
    collapsed.
    """
    normalized = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    # The curly apostrophe is already dropped by the ASCII step above; the
    # replace is kept so the intent stays visible if that step ever changes.
    cleaned = normalized.replace("'", "").replace("’", "").replace("`", "").replace('"', '')
    cleaned = cleaned.replace(" - ", " ")  # Remove " - " to avoid duplication
    cleaned = ' '.join(cleaned.split())    # Collapse multiple spaces
    return cleaned.strip()


def capitalize_hyphenated(word):
    """
    Capitalize every hyphen-separated part of *word*.

    E.g. "yooka-laylee" -> "Yooka-Laylee". Everything after the first
    character of each part is lowercased; empty parts (leading, trailing or
    doubled hyphens) are preserved as-is.
    """
    return '-'.join(part[:1].upper() + part[1:].lower() for part in word.split('-'))


# Regex for Roman numerals: I, II, III, IV, V, VI, VII, VIII, IX, X, XI, etc.
# M{0,4} admits values up to 4999. Every group is optional, so the pattern
# also matches the empty string; is_roman_numeral() rejects that case.
ROMAN_NUMERAL_PATTERN = re.compile(
    r"^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",
    re.IGNORECASE
)

# A set of known acronyms (fully uppercase) that should remain uppercase.
ACRONYMS = {
    "HD", "2D", "3D", "4K", "VR", "AI", "API", "USB", "CPU", "GPU", "DVD", "CD",
    "RPG", "FPS", "MMO", "MMORPG", "LAN", "GUI", "NPC", "FFVII", "FFVIII", "FX", "FFIX", "FFX", "FFXII"
}

# Words kept lowercase when they appear in the middle of a title.
_LOWERCASE_EXCEPTIONS = frozenset({
    "a", "an", "and", "as", "at", "but", "by", "for", "from",
    "in", "nor", "of", "on", "or", "so", "the", "to", "with", "yet"
})

# Characters that introduce a subtitle; the split regex keeps them as tokens.
_SUBTITLE_MARKERS = frozenset({":", "~", "-", "–", "—"})
_SUBTITLE_SPLIT = re.compile(r'([:~\-–—])')

# Separators allowed between conjoined Roman numerals (e.g. "I&II").
_NUMERAL_SEPARATORS = ('&', '+', '|')


def is_roman_numeral(word):
    """
    Return True if *word* is a non-empty Roman numeral (case-insensitive).

    The check is purely syntactic, so ordinary words that happen to be
    spelled like numerals (e.g. "mix", "di") are reported as numerals too.
    """
    return bool(word) and bool(ROMAN_NUMERAL_PATTERN.match(word))


def _capitalize_special(segment):
    """Capitalize one marker-free segment, honoring acronyms and Roman numerals."""
    upper = segment.upper()
    if upper in ACRONYMS or is_roman_numeral(segment):
        return upper
    # Conjoined Roman numerals such as "i&ii" -> "I&II".
    for sep in _NUMERAL_SEPARATORS:
        if sep in segment:
            pieces = segment.split(sep)
            if all(is_roman_numeral(p) for p in pieces):
                return sep.join(p.upper() for p in pieces)
    return capitalize_hyphenated(segment)


def title_case_preserve_numbers(name):
    """
    Title-case *name* with these rules:
      • Known acronyms become fully uppercase (e.g. hd → HD, ffvii → FFVII).
      • Roman numerals become fully uppercase (e.g. iIi → III, xI → XI).
      • Hyphenated words are capitalized on both sides (→ Yooka-Laylee).
      • Roman numerals conjoined with '&', '+', or '|' become fully uppercase
        (e.g. i&ii → I&II).
      • Small filler words (a, an, and, the, of, in, etc.) become lowercase
        when they appear in the middle of the title; the first and last
        words are always capitalized.
      • The text right after a subtitle marker (":", "~", "–", "—", or "-")
        is always capitalized: the rest of the same word, and, when the word
        ends with the marker (e.g. "Zelda:"), the next word as well.

    The input is split on whitespace and re-joined with single spaces.
    """
    words = name.split()
    last_idx = len(words) - 1
    result = []
    # True while the upcoming text directly follows a subtitle marker.
    force_capitalize = False

    for idx, word in enumerate(words):
        is_edge = idx == 0 or idx == last_idx
        pieces = []

        # The capturing group keeps each marker as its own token.
        for part in _SUBTITLE_SPLIT.split(word):
            if part in _SUBTITLE_MARKERS:
                pieces.append(part)
                force_capitalize = True
                continue

            lower_part = part.lower()
            if force_capitalize or is_edge or lower_part not in _LOWERCASE_EXCEPTIONS:
                pieces.append(_capitalize_special(part))
            else:
                # In-the-middle filler word: keep lowercase.
                pieces.append(lower_part)

        result.append(''.join(pieces))

        # Carry the forced capitalization into the next word only when this
        # word *ends* with a marker. A hyphen inside a word ("Yooka-Laylee")
        # must not capitalize a following filler word.
        force_capitalize = word[-1] in _SUBTITLE_MARKERS

    return ' '.join(result)


def create_formatted_structure(root_folder):
    """
    Walk root_folder for all .pchtxt files. For each one:
      1. Take the name of the folder containing the file → raw game name.
      2. sanitize_name() → remove weird characters.
      3. Remove trailing "Graphics" if present.
      4. title_case_preserve_numbers() to get the final <Game Name>.
      5. Create folder: formatted/<Game Name> - Graphics Mods/
      6. Copy the file into that folder as <version>.pchtxt, where <version>
         is the original file name without its extension.

    The formatted/ output folder itself is excluded from the walk, so running
    the script again does not re-process files it produced earlier.
    """
    formatted_path = os.path.join(root_folder, 'formatted')
    os.makedirs(formatted_path, exist_ok=True)
    formatted_abs = os.path.abspath(formatted_path)

    print(f"Creating formatted structure at: {formatted_path}\n")

    for current_root, dirs, files in os.walk(root_folder):
        # Prune the output directory in place so os.walk never descends into it.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(current_root, d)) != formatted_abs
        ]

        for file in files:
            if not file.lower().endswith('.pchtxt'):
                continue

            version = file[:-len('.pchtxt')].strip()
            # abspath drops a trailing separator, which would otherwise make
            # basename() return an empty string for files directly in the root.
            parent_dir = os.path.basename(os.path.abspath(current_root))
            game_name = sanitize_name(parent_dir)

            # Remove trailing "Graphics" if it exists (exact match at end)
            if game_name.endswith("Graphics"):
                game_name = game_name[:-len("Graphics")].strip()

            # Title-case with acronyms, roman-numeral, and compound-numeral logic
            game_name = title_case_preserve_numbers(game_name)

            mod_name = "Graphics Mods"
            target_dir = os.path.join(formatted_path, f"{game_name} - {mod_name}")
            os.makedirs(target_dir, exist_ok=True)

            source_path = os.path.join(current_root, file)
            dest_path = os.path.join(target_dir, f"{version}.pchtxt")

            shutil.copy2(source_path, dest_path)
            print(f"Copied {source_path} → {dest_path}")

    print("\nDone!")


def main():
    """Command-line entry point: expects the root folder as the only argument."""
    if len(sys.argv) != 2:
        print(f"Usage: python {os.path.basename(sys.argv[0])} /path/to/root/folder")
        sys.exit(1)

    folder_path = sys.argv[1]
    # Refuse a missing root: makedirs() would otherwise create it silently
    # and the run would report success without copying anything.
    if not os.path.isdir(folder_path):
        print(f"Error: not a directory: {folder_path}")
        sys.exit(1)

    create_formatted_structure(folder_path)


if __name__ == "__main__":
    main()