Files
Alchemist/scripts/cucholix/format_contents_repo.py

248 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import os
import re
import sys
import shutil
import tempfile
import subprocess
import unicodedata
def sanitize_name(name):
"""Normalize game names: remove symbols, accents, and replace ' - ' with space."""
n = unicodedata.normalize('NFKD', name).encode('ascii','ignore').decode('ascii')
n = n.replace("'", "").replace("", "").replace("`", "").replace('"', "")
n = n.replace(" - ", " ") # Replace problematic ' - ' only in the game name
return ' '.join(n.split()).strip()
def capitalize_hyphenated(word):
"""
Capitalize both parts of a hyphenated word. E.g. "yooka-laylee""Yooka-Laylee".
"""
parts = word.split('-')
capitalized = []
for part in parts:
if part:
capitalized.append(
part[0].upper() + part[1:].lower() if len(part) > 1 else part.upper()
)
else:
capitalized.append('')
return '-'.join(capitalized)
# Regex for Roman numerals (supports up to 3999): I, II, III, IV, V, etc.
ROMAN_NUMERAL_PATTERN = re.compile(
r"^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",
re.IGNORECASE
)
# A set of known acronyms that should remain uppercase exactly as is.
ACRONYMS = {
"HD", "2D", "3D", "4K", "VR", "AI", "API", "USB", "CPU", "GPU", "DVD", "CD",
"RPG", "FPS", "MMO", "MMORPG", "LAN", "GUI", "NPC",
"FFVII", "FFVIII", "FFIX", "FFX", "FFXII",
"FX", "2K", "5K", "8K", "V1", "V2", "V3", "V4", "DOF"
}
def is_roman_numeral(word):
"""Return True if the word is a valid Roman numeral (e.g. I, ii, Xx, etc.)."""
return bool(ROMAN_NUMERAL_PATTERN.match(word))
def title_case_preserve_numbers(name):
"""
Title-case with these rules:
• Fully uppercase acronyms remain unchanged.
• Roman numerals are fully uppercase.
• Hyphenated words are capitalized on both sides.
• Conjoined Roman numerals with '&', '+', or '|' become fully uppercase.
• Small filler words (a, an, and, the, of, in, etc.) become lowercase
only if they are in the middle of the title (not first, not last, not
immediately after a subtitle marker).
• After a subtitle marker ( :, ~, , —, - ), force capitalization on all
subsequent words until the next subtitle marker or end-of-title.
"""
lowercase_exceptions = {
"a", "an", "and", "as", "at", "but", "by", "for", "from",
"in", "nor", "of", "on", "or", "so", "the", "to", "with", "yet"
}
subtitle_markers = {":", "~", "-", "", ""}
words = name.split()
result = []
force_capitalize_mode = False # Once True, it stays True until next subtitle marker
for idx, word in enumerate(words):
# Detect if this word contains any subtitle marker character
contains_marker = any(marker in word for marker in subtitle_markers)
# Split on subtitle markers but keep them in the list
split_parts = re.split(r'([:~\-–—])', word)
capitalized_parts = []
for part in split_parts:
if part in subtitle_markers:
# Keep the subtitle marker, then force subsequent parts to capitalize
capitalized_parts.append(part)
force_capitalize_mode = True
continue
lower_part = part.lower()
is_first = (idx == 0)
is_last = (idx == len(words) - 1)
# Helper: capitalize a sub-word with special rules
def capitalize_special(w):
if w.upper() in ACRONYMS:
return w.upper()
if is_roman_numeral(w):
return w.upper()
for sep in ['&', '+', '|']:
if sep in w:
sub = w.split(sep)
if all(is_roman_numeral(x) for x in sub):
return sep.join(x.upper() for x in sub)
# Otherwise, capitalize hyphenated words normally
return capitalize_hyphenated(w)
# Decide how to capitalize this segment:
if (
force_capitalize_mode or
is_first or
is_last or
(lower_part not in lowercase_exceptions)
):
sub_parts = part.split('-')
capitalized_sub_parts = [capitalize_special(sp) for sp in sub_parts]
capitalized_parts.append('-'.join(capitalized_sub_parts))
else:
capitalized_parts.append(lower_part)
result.append(''.join(capitalized_parts))
# If this word did not contain a subtitle marker, stop forcing next capitalization
if not contains_marker:
force_capitalize_mode = False
# Always re-capitalize FIRST and LAST words fully using the same special rules
if result:
first_word_parts = result[0].split('-')
result[0] = '-'.join(
sp.upper() if (sp.upper() in ACRONYMS or is_roman_numeral(sp)) else capitalize_hyphenated(sp)
for sp in first_word_parts
)
last_word_parts = result[-1].split('-')
result[-1] = '-'.join(
sp.upper() if (sp.upper() in ACRONYMS or is_roman_numeral(sp)) else capitalize_hyphenated(sp)
for sp in last_word_parts
)
return ' '.join(result)
def find_title_id(path):
"""Look for a 16-character Title ID under any 'contents/' directory in path."""
for root, dirs, _ in os.walk(path):
if os.path.basename(root).lower() == 'contents':
for d in dirs:
if re.fullmatch(r'[0-9A-Fa-f]{16}', d):
return d
return None
def extract_with_7z(rar_path, tmpdir):
"""
Extract only atmosphere/contents/* using 7z.
Return True if extraction succeeded (7z return code 0 OR
we already found a Title ID folder inside tmpdir).
"""
cmd = [
"7z", "x", "-y",
rar_path,
f"-o{tmpdir}",
"atmosphere/contents/*"
]
proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return (proc.returncode == 0) or (find_title_id(tmpdir) is not None)
def process_rar(root_folder, rar_relpath, output_root):
"""
1) Parse “release_<version>.rar” or “release_<version>.part01.rar” → version string.
2) Sanitize + title-case the game folder name.
3) Extract with 7z into a temp dir.
4) Find Title ID under atmosphere/contents/<16hex>/.
5) Copy that content into output/<GameName>/version/<TitleID>/.
"""
subdir, filename = os.path.split(rar_relpath)
# New regex: capture base version, ignoring any “.partXX”
version_match = re.match(r"release_(.+?)(?:\.part\d+)?\.rar$", filename, re.IGNORECASE)
if not version_match:
print(f"❌ Invalid release name: {filename}")
return
version = version_match.group(1) # e.g. “1.2.4” even if filename was “release_1.2.4.part01.rar”
raw_game_name = os.path.basename(subdir)
# 2) Clean & normalize game name
cleaned_name = sanitize_name(raw_game_name)
game_name = title_case_preserve_numbers(cleaned_name)
pack_label = f"{game_name} - Graphics Pack"
rar_path = os.path.join(root_folder, rar_relpath)
with tempfile.TemporaryDirectory() as tmp:
ok = extract_with_7z(rar_path, tmp)
if not ok:
print(f"❌ Extraction error (7z) on {rar_relpath}")
return
title_id = find_title_id(tmp)
if not title_id:
print(f"❌ No Title ID found in {rar_relpath}")
return
version_dir = os.path.join(output_root, pack_label, version)
os.makedirs(version_dir, exist_ok=True)
src = os.path.join(tmp, "atmosphere", "contents", title_id)
dst = os.path.join(version_dir, title_id)
try:
shutil.copytree(src, dst, dirs_exist_ok=True)
print(f"{rar_relpath}{os.path.join(pack_label, version, title_id)}")
except Exception as e:
print(f"❌ Copy failed for {rar_relpath}: {e}")
def main():
if len(sys.argv) != 2:
print("Usage: python collect_content_mods.py /path/to/root")
sys.exit(1)
root = sys.argv[1]
output = os.path.join(root, "format2")
os.makedirs(output, exist_ok=True)
tasks = []
for dirpath, _, files in os.walk(root):
for fn in files:
# 1) Only consider RARs that are either:
# • single-part: “release_<version>.rar”
# • first part of multi-part: “release_<version>.part01.rar”
#
# Regex explanation:
# release_ → literal prefix
# (.+?) → capture “version” (non-greedy)
# (?:\.part\d+)? → optionally “.partNN” (where NN = digits)
# \.rar$ → end with “.rar”
m = re.match(r"release_(.+?)(?:\.part\d+)?\.rar$", fn, re.IGNORECASE)
if not m:
continue
# If it really is a “.partNN.rar” (some NN > 1), skip it.
# We only want “.part01” or no “.part” at all.
if fn.lower().endswith(".part01.rar") or fn.lower().endswith(".rar") and ".part" not in fn.lower():
rel = os.path.relpath(os.path.join(dirpath, fn), root)
tasks.append(rel)
for relpath in tasks:
process_rar(root, relpath, output)
print("✅ Done.")
if __name__ == "__main__":
main()