📦 Stirling-Tools / Stirling-PDF

📄 bulk_auto_translate.py · 360 lines
#!/usr/bin/env python3
"""
Bulk Auto-Translate All Languages
Automatically translates all languages in parallel using OpenAI API.
Supports concurrent translation with configurable thread pool.
"""

import argparse
import os
import sys
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess
from typing import List, Tuple, Optional
import threading

import tomllib


# Module-wide lock that serializes calls made through safe_print()
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Call ``print`` with the given arguments while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print() raises (e.g. closed stream)
        print_lock.release()


def get_all_languages(locales_dir: Path) -> List[str]:
    """Return the language codes found under ``locales_dir``.

    A sub-directory counts as a language when it contains a
    ``translation.toml`` file. The ``en-GB`` directory is excluded.

    Args:
        locales_dir: Directory holding one sub-directory per language.

    Returns:
        Sub-directory names in sorted path order, or an empty list (after
        printing an error) when ``locales_dir`` is missing or is not a
        directory.
    """
    # is_dir() instead of exists(): a path pointing at a regular file would
    # pass exists() and then make iterdir() raise NotADirectoryError.
    if not locales_dir.is_dir():
        print(f"Error: Locales directory not found: {locales_dir}")
        return []

    return [
        entry.name
        for entry in sorted(locales_dir.iterdir())
        if entry.is_dir()
        and entry.name != "en-GB"
        and (entry / "translation.toml").exists()
    ]


def get_language_completion(locales_dir: Path, language: str) -> Optional[float]:
    """Get completion percentage for a language."""
    lang_dir = locales_dir / language
    toml_file = lang_dir / "translation.toml"

    if not toml_file.exists():
        return None

    try:
        with open(toml_file, "rb") as f:
            target_data = tomllib.load(f)

        # Load en-GB reference
        en_gb_file = locales_dir / "en-GB" / "translation.toml"
        with open(en_gb_file, "rb") as f:
            en_gb_data = tomllib.load(f)

        # Flatten and count
        def flatten(d, parent=""):
            items = {}
            for k, v in d.items():
                key = f"{parent}.{k}" if parent else k
                if isinstance(v, dict):
                    items.update(flatten(v, key))
                else:
                    items[key] = v
            return items

        en_gb_flat = flatten(en_gb_data)
        target_flat = flatten(target_data)

        # Count translated (not equal to en-GB)
        translated = sum(
            1
            for k in en_gb_flat
            if k in target_flat and target_flat[k] != en_gb_flat[k]
        )
        total = len(en_gb_flat)

        return (translated / total * 100) if total > 0 else 0.0

    except Exception as e:
        print(f"Warning: Could not calculate completion for {language}: {e}")
        return None


def translate_language(
    language: str,
    api_key: str,
    batch_size: int,
    timeout: int,
    skip_verification: bool,
    include_existing: bool,
) -> Tuple[str, bool, str]:
    """
    Translate a single language by running ``auto_translate.py`` in a subprocess.

    Args:
        language: Language code passed to the child script as its positional argument.
        api_key: Value forwarded via ``--api-key``.
        batch_size: Value forwarded via ``--batch-size``.
        timeout: Value forwarded via ``--timeout``; the subprocess as a whole
            is allowed five times this many seconds.
        skip_verification: When true, ``--skip-verification`` is appended.
        include_existing: When true, ``--include-existing`` is appended.

    Returns: (language_code, success, message)
        ``message`` is ``"Already complete"``, ``"Success"``,
        ``"Timeout exceeded"``, the text of an unexpected exception, or up to
        200 characters of the child's stderr/stdout on a non-zero exit code.
    """
    safe_print(f"[{language}] Starting translation...")

    cmd = [
        # Use the interpreter running this script rather than whatever
        # "python3" resolves to on PATH (it may be absent, e.g. on Windows, or
        # be a different installation). Fall back if sys.executable is unset.
        sys.executable or "python3",
        # NOTE: relative path, resolved against the current working directory.
        "scripts/translations/auto_translate.py",
        language,
        # NOTE(review): the key travels as a command-line argument and is
        # therefore visible in the process list; consider an environment
        # variable if auto_translate.py supports one - confirm.
        "--api-key",
        api_key,
        "--batch-size",
        str(batch_size),
        "--timeout",
        str(timeout),
    ]

    if skip_verification:
        cmd.append("--skip-verification")

    if include_existing:
        cmd.append("--include-existing")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 5,  # Overall timeout = 5x per-batch timeout
        )
    except subprocess.TimeoutExpired:
        safe_print(f"[{language}] ✗ Timeout exceeded")
        return (language, False, "Timeout exceeded")
    except Exception as e:
        # Best-effort: report any launch failure as a failed language
        # instead of propagating it to the caller.
        safe_print(f"[{language}] ✗ Error: {str(e)}")
        return (language, False, str(e))

    if result.returncode != 0:
        error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
        safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}")
        return (language, False, error_msg[:200])  # Truncate long errors

    # The child prints this marker when there was nothing left to translate
    if "Nothing to translate!" in result.stdout:
        safe_print(f"[{language}] ✓ Already complete")
        return (language, True, "Already complete")

    safe_print(f"[{language}] ✓ Success")
    return (language, True, "Success")


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing every command-line option."""
    parser = argparse.ArgumentParser(
        description="Bulk auto-translate all languages using OpenAI API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Translate all languages with 10 parallel threads
  python3 bulk_auto_translate.py --parallel 10

  # Translate only incomplete languages (< 95%)
  python3 bulk_auto_translate.py --parallel 5 --threshold 95

  # Translate specific languages only
  python3 bulk_auto_translate.py --languages de-DE fr-FR es-ES --parallel 3

  # Dry run to see what would be translated
  python3 bulk_auto_translate.py --dry-run

Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
""",
    )

    parser.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Number of parallel translation threads (default: 1)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=500,
        help="Entries per batch for translation (default: 500)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=600,
        help="Timeout per batch in seconds (default: 600)",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.0,
        help="Only translate languages below this completion %% (default: 0 = all)",
    )
    parser.add_argument(
        "--languages",
        nargs="+",
        help="Translate only specific languages (e.g., de-DE fr-FR)",
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument(
        "--skip-verification",
        action="store_true",
        help="Skip final completion verification for each language",
    )
    parser.add_argument(
        "--include-existing",
        action="store_true",
        help="Also retranslate existing keys that match English (default: only translate missing keys)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be translated without actually translating",
    )
    return parser


def _print_summary(languages: List[str], results: dict, elapsed: float) -> None:
    """Print the totals and the per-category language lists of a finished run."""
    print("\n" + "=" * 60)
    print("Bulk Translation Summary")
    print("=" * 60)
    print(f"Total languages: {len(languages)}")
    print(f"Successful: {len(results['success'])}")
    print(f"Already complete: {len(results['already_complete'])}")
    print(f"Failed: {len(results['failed'])}")
    print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed / 60:.1f} minutes)")
    print("=" * 60)

    if results["success"]:
        print(f"\n✅ Successfully translated ({len(results['success'])}):")
        for lang in sorted(results["success"]):
            print(f"  - {lang}")

    if results["already_complete"]:
        print(f"\n✓ Already complete ({len(results['already_complete'])}):")
        for lang in sorted(results["already_complete"]):
            print(f"  - {lang}")

    if results["failed"]:
        print(f"\n❌ Failed ({len(results['failed'])}):")
        for lang, msg in sorted(results["failed"]):
            print(f"  - {lang}: {msg}")


def main():
    """Command-line entry point for bulk translation.

    Parses the arguments, selects the languages (explicit ``--languages`` or
    every language found in the locales directory, optionally filtered by
    ``--threshold``), then either lists them (``--dry-run``) or translates
    them through a thread pool and prints a summary.

    Exits with status 1 when the API key is missing or any language failed,
    status 2 on invalid arguments, and status 0 when there is nothing to do
    or after a dry run.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0; reject such
    # values up front with a regular usage error instead of a traceback.
    if args.parallel < 1:
        parser.error("--parallel must be at least 1")

    # Verify API key (unless dry run)
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not args.dry_run and not api_key:
        print(
            "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
        )
        sys.exit(1)

    locales_dir = Path(args.locales_dir)

    # Get languages to translate
    if args.languages:
        languages = args.languages
        print(f"Translating specified languages: {', '.join(languages)}")
    else:
        languages = get_all_languages(locales_dir)
        print(f"Found {len(languages)} languages (excluding en-GB)")

    if not languages:
        print("No languages to translate!")
        sys.exit(0)

    # Filter by completion threshold
    if args.threshold > 0:
        print(f"\nFiltering languages below {args.threshold}% completion...")
        filtered = []
        for lang in languages:
            completion = get_language_completion(locales_dir, lang)
            if completion is None:
                filtered.append(lang)  # Include if can't determine
                print(f"  {lang}: Unknown completion - will translate")
            elif completion < args.threshold:
                filtered.append(lang)
                print(f"  {lang}: {completion:.1f}% - will translate")
            else:
                print(f"  {lang}: {completion:.1f}% - skipping (above threshold)")

        languages = filtered

        if not languages:
            print("\nNo languages below threshold!")
            sys.exit(0)

    print(f"\n{'=' * 60}")
    print("Bulk Translation Configuration")
    print(f"{'=' * 60}")
    print(f"Languages to translate: {len(languages)}")
    print(f"Parallel threads: {args.parallel}")
    print(f"Batch size: {args.batch_size}")
    print(f"Timeout per batch: {args.timeout}s")
    if args.threshold > 0:
        print(f"Completion threshold: {args.threshold}%")
    print(f"{'=' * 60}\n")

    if args.dry_run:
        print("DRY RUN - Languages that would be translated:")
        for lang in languages:
            completion = get_language_completion(locales_dir, lang)
            comp_str = f"{completion:.1f}%" if completion is not None else "Unknown"
            print(f"  - {lang} ({comp_str})")
        print(f"\nTotal: {len(languages)} languages")
        sys.exit(0)

    start_time = time.time()

    # Translate in parallel; each worker handles one language
    results = {"success": [], "failed": [], "already_complete": []}

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {
            executor.submit(
                translate_language,
                lang,
                api_key,
                args.batch_size,
                args.timeout,
                args.skip_verification,
                args.include_existing,
            ): lang
            for lang in languages
        }

        # Sort each finished language into its result bucket as it completes
        for future in as_completed(futures):
            language, success, message = future.result()

            if not success:
                results["failed"].append((language, message))
            elif message == "Already complete":
                results["already_complete"].append(language)
            else:
                results["success"].append(language)

    elapsed = time.time() - start_time

    _print_summary(languages, results, elapsed)

    # Non-zero exit status signals that at least one language failed
    if results["failed"]:
        sys.exit(1)

    print("\n✅ Bulk translation completed successfully!")


if __name__ == "__main__":
    main()