Update format_repo5.py

2025-05-31 06:39:55 -07:00
parent c595af9e62
commit f005f477ae
1 changed files with 152 additions and 34 deletions
--- a/scripts/cucholix/format_repo5.py
+++ b/scripts/cucholix/format_repo5.py
@@ -1,73 +1,191 @@
+#!/usr/bin/env python3
 import os
 import shutil
 import unicodedata
 import sys
+import re

 def sanitize_name(name):
-    # Remove accents and unwanted characters
+    """
+    Remove accents and unwanted characters, and replace ' - ' with a single space.
+    """
    normalized = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    cleaned = normalized.replace("'", "").replace("’", "").replace("`", "").replace('"', '')
    cleaned = cleaned.replace(" - ", " ")  # Remove " - " to avoid duplication
-    cleaned = ' '.join(cleaned.split())  # Collapse multiple spaces
+    cleaned = ' '.join(cleaned.split())    # Collapse multiple spaces
    return cleaned.strip()

+def capitalize_hyphenated(word):
+    """
+    Capitalize each hyphen-separated part of a word (first letter upper, rest lower). E.g. "yooka-laylee" → "Yooka-Laylee".
+    """
+    parts = word.split('-')
+    capitalized_parts = []
+    for part in parts:
+        if part:
+            capitalized_parts.append(part[0].upper() + part[1:].lower() if len(part) > 1 else part.upper())
+        else:
+            capitalized_parts.append('')
+    return '-'.join(capitalized_parts)
+
+# Regex for Roman numerals: I, II, III, IV, V, VI, VII, VIII, IX, X, XI, etc. M{0,4} allows values up to 4999; every group is optional, so the empty string matches too.
+ROMAN_NUMERAL_PATTERN = re.compile(
+    r"^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",
+    re.IGNORECASE
+)
+
+# A set of known acronyms, stored uppercase; a word that matches one case-insensitively is rendered fully uppercase.
+ACRONYMS = {
+    "HD", "2D", "3D", "4K", "VR", "AI", "API", "USB", "CPU", "GPU", "DVD", "CD",
+    "RPG", "FPS", "MMO", "MMORPG", "LAN", "GUI", "NPC", "FFVII", "FFVIII", "FX", "FFIX", "FFX", "FFXII"
+}
+
+def is_roman_numeral(word):
+    """
+    Return True if the word matches the Roman-numeral pattern (case-insensitive; note the empty string also matches).
+    """
+    return bool(ROMAN_NUMERAL_PATTERN.match(word))
+
 def title_case_preserve_numbers(name):
-    # Capitalize title correctly with exceptions for filler words in the middle
+    """
+    Title-case with these rules:
+      • Known acronyms (matched case-insensitively against ACRONYMS) become fully uppercase (e.g. hd → HD, 2d → 2D, ffvii → FFVII).
+      • Roman numerals become fully uppercase (e.g. iIi → III, xI → XI).
+      • Hyphenated words are capitalized on both sides (→ Yooka-Laylee).
+      • Conjoined Roman numerals with '&', '+', or '|' become fully uppercase (e.g. I&ii → I&II).
+      • Small filler words (a, an, and, the, of, in, etc.) become lowercase
+        only if they appear in the middle and are not immediately after a subtitle marker,
+        except the first and last words (which are always capitalized).
+      • After a subtitle marker (":", "~", "–", "—", or "-"), force capitalization
+        of the rest of that word and of the word that immediately follows it.
+    """
    lowercase_exceptions = {
-        "a", "an", "and", "as", "at", "but", "by", "for", "from", "in", "nor",
-        "of", "on", "or", "so", "the", "to", "yet", "with"
+        "a", "an", "and", "as", "at", "but", "by", "for", "from",
+        "in", "nor", "of", "on", "or", "so", "the", "to", "with", "yet"
    }
+    subtitle_markers = {":", "~", "-", "–", "—"}

    words = name.split()
    result = []
+    force_capitalize_mode = False  # Set by a subtitle marker; cleared after the first following word that has no marker

-    for i, word in enumerate(words):
-        if (0 < i < len(words) - 1) and word.lower() in lowercase_exceptions:
-            result.append(word.lower())
-        else:
-            result.append(word.capitalize())
+    for idx, word in enumerate(words):
+        # Detect if this word contains any subtitle marker character
+        contains_marker = any(marker in word for marker in subtitle_markers)
+
+        # Split on subtitle markers but keep them in the list
+        split_parts = re.split(r'([:~\-–—])', word)
+        capitalized_parts = []
+
+        for part in split_parts:
+            if part in subtitle_markers:
+                # Append the marker itself, then force-capitalize subsequent parts
+                capitalized_parts.append(part)
+                force_capitalize_mode = True
+                continue
+
+            lower_part = part.lower()
+            is_first = (idx == 0)
+            is_last = (idx == len(words) - 1)
+
+            # Helper: capitalize a sub-word with special rules for acronyms, roman numerals, and conjoined numerals
+            def capitalize_special(w):
+                # If w (case-insensitive) is in our ACRONYMS set, uppercase it fully.
+                if w.upper() in ACRONYMS:
+                    return w.upper()
+                # If w alone is a Roman numeral, uppercase it fully.
+                if is_roman_numeral(w):
+                    return w.upper()
+                # Handle compound Roman numerals separated by &, +, or |
+                for sep in ['&', '+', '|']:
+                    if sep in w:
+                        parts = w.split(sep)
+                        if all(is_roman_numeral(p) for p in parts):
+                            return sep.join(p.upper() for p in parts)
+                # Otherwise, capitalize hyphenated words normally.
+                return capitalize_hyphenated(w)
+
+            # Decide how to capitalize this segment:
+            if force_capitalize_mode or is_first or is_last or (lower_part not in lowercase_exceptions):
+                # Apply capitalize_special per hyphen-separated piece ('-' was already split off as a marker above, so this is normally one piece)
+                sub_parts = part.split('-')
+                capitalized_sub = [capitalize_special(sp) for sp in sub_parts]
+                capitalized_parts.append('-'.join(capitalized_sub))
+            else:
+                # In-the-middle filler word: keep lowercase
+                capitalized_parts.append(lower_part)
+
+        result.append(''.join(capitalized_parts))
+
+        # If this word did not contain a subtitle marker, stop forcing next capitalization
+        if not contains_marker:
+            force_capitalize_mode = False
+
+    # Always capitalize the FIRST and LAST words (acronym/Roman-numeral rules only; NOTE(review): a compound like "I&II" is turned back into "I&ii" here):
+    if result:
+        first_parts = result[0].split('-')
+        result[0] = '-'.join(
+            sp.upper() if (sp.upper() in ACRONYMS or is_roman_numeral(sp)) else capitalize_hyphenated(sp)
+            for sp in first_parts
+        )
+
+        last_parts = result[-1].split('-')
+        result[-1] = '-'.join(
+            sp.upper() if (sp.upper() in ACRONYMS or is_roman_numeral(sp)) else capitalize_hyphenated(sp)
+            for sp in last_parts
+        )

    return ' '.join(result)

 def create_formatted_structure(root_folder):
+    """
+    Walk root_folder for all .pchtxt files. For each one:
+      1. Extract folder name → raw game name.
+      2. sanitize_name() → strip accents/non-ASCII characters and quotes, collapse whitespace.
+      3. Remove trailing "Graphics" if present.
+      4. title_case_preserve_numbers() to get final Game Name.
+      5. Create folder: formatted/<Game Name> - Graphics Mods/
+      6. Copy each .pchtxt into that folder as <version>.pchtxt.
+    """
    formatted_path = os.path.join(root_folder, 'formatted')
    os.makedirs(formatted_path, exist_ok=True)
-    print(f"Creating formatted structure at: {formatted_path}")
+    print(f"Creating formatted structure at: {formatted_path}\n")

-    for root, dirs, files in os.walk(root_folder):
+    for current_root, dirs, files in os.walk(root_folder):
        for file in files:
-            if file.endswith('.pchtxt'):
-                version = file.replace('.pchtxt', '').strip()
+            if not file.lower().endswith('.pchtxt'):
+                continue

-                # Extract game name from parent folder
-                game_name = os.path.basename(os.path.dirname(os.path.join(root, file)))
-                game_name = sanitize_name(game_name)
+            version = file[:-len('.pchtxt')].strip()
+            parent_dir = os.path.basename(current_root)
+            game_name = sanitize_name(parent_dir)

-                # Remove trailing "Graphics" if it exists from old names
-                if game_name.endswith("Graphics"):
-                    game_name = game_name[:-len("Graphics")].strip()
+            # Remove trailing "Graphics" if it exists (exact match at end)
+            if game_name.endswith("Graphics"):
+                game_name = game_name[:-len("Graphics")].strip()

-                # Apply proper title casing
-                game_name = title_case_preserve_numbers(game_name)
+            # Title-case with acronyms, roman-numeral, and compound-numeral logic
+            game_name = title_case_preserve_numbers(game_name)

-                mod_name = "Graphics Mods"
-                target_dir = os.path.join(formatted_path, f"{game_name} - {mod_name}")
-                os.makedirs(target_dir, exist_ok=True)
+            mod_name = "Graphics Mods"
+            target_dir = os.path.join(formatted_path, f"{game_name} - {mod_name}")
+            os.makedirs(target_dir, exist_ok=True)

-                source_path = os.path.join(root, file)
-                dest_path = os.path.join(target_dir, f"{version}.pchtxt")
+            source_path = os.path.join(current_root, file)
+            dest_path = os.path.join(target_dir, f"{version}.pchtxt")

-                shutil.copy2(source_path, dest_path)
-                print(f"Copied {source_path} → {dest_path}")
+            shutil.copy2(source_path, dest_path)
+            print(f"Copied {source_path} → {dest_path}")

-def main(folder_path):
-    create_formatted_structure(folder_path)
-    print("Done!")
+    print("\nDone!")

-if __name__ == "__main__":
+def main():
    if len(sys.argv) != 2:
        print("Usage: python collect_graphics_mods.py /path/to/root/folder")
        sys.exit(1)

    folder_path = sys.argv[1]
-    main(folder_path)
+    create_formatted_structure(folder_path)
+
+if __name__ == "__main__":
+    main()