brush-paint-opt: density-aware unpainted + lex comparator + meta-optimizer

Adds per-letter unpainted-cluster CC analysis and uses cluster-size^1.5
in the score function (so 30 px in one cluster reads worse than the
same 30 px scattered as edge slop). Hard barrier in score_for_letter
now fires on max-cluster > 0.5×brush_area instead of total unpainted%.

CorpusReport + compare_reports give the outer optimizer a hard
lexicographic ranking: tier-1 = letters failing each hard criterion,
tier-2 = corpus aggregates (bg, strokes, density, repaint, length).

run_meta_opt samples random ScoreWeights, runs the inner optimizer
under each, ranks results lex-wise. paint_meta_optimize test reads
META_N_OUTER / META_N_INNER / META_PASSES env vars.

Distributed orchestrator updated to use bash -lc on remote so non-
login SSH shells pick up cargo from ~/.cargo/bin.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mitchell Hansen
2026-05-02 00:22:37 -07:00
parent d461d2d20e
commit 9748e06ca9
3 changed files with 508 additions and 36 deletions

View File

@@ -43,13 +43,16 @@ echo "[orch] split: $LOCAL_N local, $REMOTE_N remote" >&2
echo "[orch] cargo build --release --bin paint_opt_worker (local)…" >&2 echo "[orch] cargo build --release --bin paint_opt_worker (local)…" >&2
( cd "$ROOT" && cargo build --release --bin paint_opt_worker ) >&2 ( cd "$ROOT" && cargo build --release --bin paint_opt_worker ) >&2
# Build remote binary in parallel if a remote is configured. # Build remote binary in parallel if a remote is configured. We invoke
# remote commands via `bash -lc` so the login shell sources whatever
# rc files put cargo on PATH (~/.cargo/bin); without that, ssh's
# non-login shell may fail with "cargo: command not found".
REMOTE_BUILD_PID="" REMOTE_BUILD_PID=""
if [[ -n "$REMOTE" ]]; then if [[ -n "$REMOTE" ]]; then
HOST="${REMOTE%%:*}" HOST="${REMOTE%%:*}"
RPATH="${REMOTE#*:}" RPATH="${REMOTE#*:}"
echo "[orch] cargo build --release on remote ($HOST:$RPATH)…" >&2 echo "[orch] cargo build --release on remote ($HOST:$RPATH)…" >&2
( ssh "$HOST" "cd '$RPATH' && cargo build --release --bin paint_opt_worker" >&2 ) & ( ssh "$HOST" "bash -lc 'cd \"$RPATH\" && cargo build --release --bin paint_opt_worker'" >&2 ) &
REMOTE_BUILD_PID=$! REMOTE_BUILD_PID=$!
fi fi
@@ -83,7 +86,7 @@ if [[ -n "$REMOTE" && "$REMOTE_N" -gt 0 ]]; then
echo "[orch] launching $REMOTE_N remote workers on $HOST" >&2 echo "[orch] launching $REMOTE_N remote workers on $HOST" >&2
# Generate the index list local-side and stream to xargs over ssh. # Generate the index list local-side and stream to xargs over ssh.
seq "$LOCAL_N" $((N - 1)) | \ seq "$LOCAL_N" $((N - 1)) | \
ssh "$HOST" "cd '$RPATH' && xargs -n1 -P$REMOTE_N -I{} ./target/release/paint_opt_worker {}" \ ssh "$HOST" "bash -lc 'cd \"$RPATH\" && xargs -n1 -P$REMOTE_N -I{} ./target/release/paint_opt_worker {}'" \
>> "$REMOTE_OUT" & >> "$REMOTE_OUT" &
REMOTE_PID=$! REMOTE_PID=$!
fi fi

View File

@@ -205,6 +205,11 @@ pub struct PaintDebug {
/// near this; well under 1.5× this means efficient, well over means /// near this; well under 1.5× this means efficient, well over means
/// the path is snaking. /// the path is snaking.
pub skeleton_length: u32, pub skeleton_length: u32,
/// Sizes of every connected component in the *unpainted* ink mask
/// at the end of painting. Lets the score function tell scattered
/// edge slop ("ten 1-px clusters") apart from feature-sized misses
/// ("one 30-px crossbar tip cluster").
pub unpainted_clusters: Vec<u32>,
/// Raw trajectories (one per stroke), pre-smoothing. /// Raw trajectories (one per stroke), pre-smoothing.
pub trajectories: Vec<Vec<(f32, f32)>>, pub trajectories: Vec<Vec<(f32, f32)>>,
/// Final smoothed strokes (what would go to gcode). /// Final smoothed strokes (what would go to gcode).
@@ -517,6 +522,44 @@ impl Grid {
} }
/// True iff `(x, y)` is an originally-ink pixel. /// True iff `(x, y)` is an originally-ink pixel.
/// 4-connected connected-component analysis on the *currently
/// unpainted* ink mask. Returns one size per component, in pixels.
/// Used for density-aware coverage scoring: one 30-px cluster (a
/// missed crossbar tip) reads as a real missing feature, whereas the
/// same 30 pixels scattered as single-pixel slop is just brush-edge
/// noise.
fn unpainted_cluster_sizes(&self) -> Vec<u32> {
    // Per-cell visited marker; the component id itself is never needed,
    // only "already claimed by some component or not".
    let mut seen = vec![false; self.unpainted.len()];
    let mut sizes: Vec<u32> = Vec::new();
    for row in 0..self.height {
        for col in 0..self.width {
            let seed = (row * self.width + col) as usize;
            if seen[seed] || !self.unpainted[seed] { continue; }
            // Iterative DFS flood fill from this seed pixel.
            let mut pixels = 0u32;
            let mut pending: Vec<(i32, i32)> = vec![(col, row)];
            while let Some((px, py)) = pending.pop() {
                let idx = (py * self.width + px) as usize;
                // A cell can be pushed more than once before it is
                // popped; re-check on pop.
                if seen[idx] || !self.unpainted[idx] { continue; }
                seen[idx] = true;
                pixels += 1;
                for (dx, dy) in [(1i32, 0i32), (-1, 0), (0, 1), (0, -1)] {
                    let (qx, qy) = (px + dx, py + dy);
                    if qx < 0 || qy < 0 || qx >= self.width || qy >= self.height { continue; }
                    let qidx = (qy * self.width + qx) as usize;
                    if self.unpainted[qidx] && !seen[qidx] {
                        pending.push((qx, qy));
                    }
                }
            }
            sizes.push(pixels);
        }
    }
    sizes
}
fn is_ink(&self, x: i32, y: i32) -> bool { fn is_ink(&self, x: i32, y: i32) -> bool {
let lx = x - self.bx; let ly = y - self.by; let lx = x - self.bx; let ly = y - self.by;
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; } if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; }
@@ -1381,7 +1424,7 @@ pub fn paint_fill_with(hull: &Hull, params: &PaintParams) -> FillResult {
/// Quantitative summary of one fill result. Computed cheaply from a /// Quantitative summary of one fill result. Computed cheaply from a
/// `FillResult` plus the source hull (the hull is needed to count /// `FillResult` plus the source hull (the hull is needed to count
/// background paint, since FillResult only has stroke geometry). /// background paint, since FillResult only has stroke geometry).
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone)]
pub struct PaintMetrics { pub struct PaintMetrics {
/// Number of strokes (= pen lifts + 1, or 0 if no strokes). /// Number of strokes (= pen lifts + 1, or 0 if no strokes).
pub strokes: u32, pub strokes: u32,
@@ -1405,6 +1448,11 @@ pub struct PaintMetrics {
/// "ideal" path length budget — `total_length` should sit close to /// "ideal" path length budget — `total_length` should sit close to
/// this for efficient single-pass tracing. /// this for efficient single-pass tracing.
pub skeleton_length: u32, pub skeleton_length: u32,
/// Sizes of unpainted-ink connected components after the algorithm
/// finishes. Density signal: one 30-px cluster (a real missing
/// feature) reads worse than thirty 1-px scattered slop pixels even
/// though both have the same total unpainted count.
pub unpainted_clusters: Vec<u32>,
/// Sum of absolute angle changes between consecutive segments along /// Sum of absolute angle changes between consecutive segments along
/// every stroke, in radians. Smooth handwriting has small total /// every stroke, in radians. Smooth handwriting has small total
/// curvature; jagged zigzag accumulates lots. /// curvature; jagged zigzag accumulates lots.
@@ -1445,16 +1493,17 @@ pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetri
c c
}).sum(); }).sum();
let m = PaintMetrics { let m = PaintMetrics {
strokes: strokes.len() as u32, strokes: strokes.len() as u32,
total_length, total_length,
bg_painted: dbg.bg_painted, bg_painted: dbg.bg_painted,
total_swept: dbg.total_swept, total_swept: dbg.total_swept,
repaint: dbg.repaint, repaint: dbg.repaint,
ink_total: dbg.ink_total, ink_total: dbg.ink_total,
ink_unpainted: dbg.ink_unpainted, ink_unpainted: dbg.ink_unpainted,
skeleton_length: dbg.skeleton_length, skeleton_length: dbg.skeleton_length,
unpainted_clusters: dbg.unpainted_clusters.clone(),
curvature, curvature,
brush_radius: dbg.brush_radius, brush_radius: dbg.brush_radius,
}; };
(FillResult { hull_id: hull.id, strokes }, m) (FillResult { hull_id: hull.id, strokes }, m)
} }
@@ -1465,10 +1514,20 @@ pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetri
/// each one has a known single-stroke topology (possibly with double-back). /// each one has a known single-stroke topology (possibly with double-back).
pub const SINGLE_STROKE_LETTERS: &str = "CGIJLMNOSUVWZcejilosvwz"; pub const SINGLE_STROKE_LETTERS: &str = "CGIJLMNOSUVWZcejilosvwz";
/// Letters whose natural human topology is *exactly two* pen strokes.
/// Crosses (T/t/X/x/+) and Y-junctions where one continuous stroke
/// would require an unnatural double-back across the cross. Constraint
/// penalty applies when stroke count ≠ 2.
pub const TWO_STROKE_LETTERS: &str = "TtXxKkYyFfHh";
pub fn is_single_stroke_letter(ch: char) -> bool { pub fn is_single_stroke_letter(ch: char) -> bool {
SINGLE_STROKE_LETTERS.contains(ch) SINGLE_STROKE_LETTERS.contains(ch)
} }
pub fn is_two_stroke_letter(ch: char) -> bool {
TWO_STROKE_LETTERS.contains(ch)
}
/// Letter-aware score: applies the default score plus hard constraint /// Letter-aware score: applies the default score plus hard constraint
/// failures. A config that trips ANY hard ceiling returns f32::MAX so /// failures. A config that trips ANY hard ceiling returns f32::MAX so
/// the optimizer rejects it outright. Soft knobs (brush_size bonus, /// the optimizer rejects it outright. Soft knobs (brush_size bonus,
@@ -1503,11 +1562,14 @@ pub fn score_for_letter(ch: char, m: &PaintMetrics) -> f32 {
s += 100_000_000.0 * (bg_rate - 0.05); s += 100_000_000.0 * (bg_rate - 0.05);
} }
} }
if m.ink_total > 0 { // Density-aware unpainted barrier. A cluster bigger than half the
let unpainted_rate = m.ink_unpainted as f32 / m.ink_total as f32; // brush footprint = a recognisable feature is missing (a crossbar
if unpainted_rate > 0.05 { // tip, half a loop, etc.). Scattered single-pixel slop never trips
s += 100_000_000.0 * (unpainted_rate - 0.05); // this; one 30-px cluster does. Threshold scales with brush.
} let cluster_threshold = 0.5 * std::f32::consts::PI * m.brush_radius * m.brush_radius;
let max_cluster = m.unpainted_clusters.iter().copied().max().unwrap_or(0) as f32;
if max_cluster > cluster_threshold {
s += 1_000_000.0 * (max_cluster - cluster_threshold);
} }
if m.skeleton_length > 0 && m.total_length > 2.0 * m.skeleton_length as f32 { if m.skeleton_length > 0 && m.total_length > 2.0 * m.skeleton_length as f32 {
// Length budget: 100k/px above 2× skel. For a 300-px-skeleton // Length budget: 100k/px above 2× skel. For a 300-px-skeleton
@@ -1522,6 +1584,10 @@ pub fn score_for_letter(ch: char, m: &PaintMetrics) -> f32 {
let delta = (m.strokes as i64 - 1).abs() as f32; let delta = (m.strokes as i64 - 1).abs() as f32;
s += 50_000.0 * delta; s += 50_000.0 * delta;
} }
if is_two_stroke_letter(ch) && m.strokes != 2 {
let delta = (m.strokes as i64 - 2).abs() as f32;
s += 50_000.0 * delta;
}
s s
} }
@@ -1537,13 +1603,22 @@ pub fn default_score(m: &PaintMetrics) -> f32 {
score_weighted(m, ScoreWeights::default()) score_weighted(m, ScoreWeights::default())
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
pub struct ScoreWeights { pub struct ScoreWeights {
pub stroke: f32, pub stroke: f32,
pub length: f32, pub length: f32,
pub bg: f32, pub bg: f32,
pub repaint: f32, pub repaint: f32,
/// Linear unpainted-pixel cost. Cheap by design — the heavy lifting
/// is done by `unpainted_density` which super-linearly weights
/// large clusters. Linear stays for tie-breaking among configs that
/// have similar density patterns.
pub unpainted: f32, pub unpainted: f32,
/// Density-aware unpainted cost. Penalty per letter is
/// `weight × Σ over clusters of size^1.5`. A 30-px cluster (a
/// recognisable missing feature) costs ~5× a 30-px scattered slop,
/// matching how visible each is on the printed page.
pub unpainted_density: f32,
/// Per-pixel cost of stroke length above 1.5× the skeleton length /// Per-pixel cost of stroke length above 1.5× the skeleton length
/// (the "ideal" trace). 0 inside budget; ramps up sharply outside. /// (the "ideal" trace). 0 inside budget; ramps up sharply outside.
pub length_excess: f32, pub length_excess: f32,
@@ -1571,14 +1646,15 @@ impl Default for ScoreWeights {
// few unpainted pixels over a larger-radius solution that paints // few unpainted pixels over a larger-radius solution that paints
// 50× as many bg pixels. // 50× as many bg pixels.
Self { Self {
stroke: 500.0, stroke: 500.0,
length: 5.0, length: 5.0,
bg: 50.0, bg: 50.0,
repaint: 30.0, repaint: 30.0,
unpainted: 200.0, unpainted: 50.0, // mild — density carries the weight
length_excess: 300.0, unpainted_density: 10.0, // cluster_size^1.5 × 10
curvature: 500.0, length_excess: 300.0,
brush_size: 2000.0, // pressure toward bigger brush. Per curvature: 500.0,
brush_size: 2000.0, // pressure toward bigger brush. Per
// letter, +1 px brush = +2000 bonus; // letter, +1 px brush = +2000 bonus;
// vs bg=50/px that's "worth" up to // vs bg=50/px that's "worth" up to
// ~40 extra bg pixels per letter. So // ~40 extra bg pixels per letter. So
@@ -1593,14 +1669,18 @@ impl Default for ScoreWeights {
pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 { pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 {
let budget = 1.5 * m.skeleton_length as f32; let budget = 1.5 * m.skeleton_length as f32;
let excess = (m.total_length - budget).max(0.0); let excess = (m.total_length - budget).max(0.0);
w.stroke * m.strokes as f32 let density: f32 = m.unpainted_clusters.iter()
+ w.length * m.total_length .map(|&n| (n as f32).powf(1.5))
+ w.bg * m.bg_painted as f32 .sum();
+ w.repaint * m.repaint as f32 w.stroke * m.strokes as f32
+ w.unpainted * m.ink_unpainted as f32 + w.length * m.total_length
+ w.length_excess * excess + w.bg * m.bg_painted as f32
+ w.curvature * m.curvature + w.repaint * m.repaint as f32
- w.brush_size * m.brush_radius + w.unpainted * m.ink_unpainted as f32
+ w.unpainted_density * density
+ w.length_excess * excess
+ w.curvature * m.curvature
- w.brush_size * m.brush_radius
} }
/// Run `paint_fill_with` once per `params_variant` and return the best /// Run `paint_fill_with` once per `params_variant` and return the best
@@ -1792,6 +1872,7 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
let ink_unpainted = grid.ink_remaining.max(0) as u32; let ink_unpainted = grid.ink_remaining.max(0) as u32;
let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid, brush_radius); let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid, brush_radius);
let skeleton_length = grid.skeleton_length; let skeleton_length = grid.skeleton_length;
let unpainted_clusters = grid.unpainted_cluster_sizes();
PaintDebug { PaintDebug {
bounds, bounds,
source_b64: crate::streamline::encode_hull_pixels_b64(hull), source_b64: crate::streamline::encode_hull_pixels_b64(hull),
@@ -1805,6 +1886,7 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
total_swept, total_swept,
repaint, repaint,
skeleton_length, skeleton_length,
unpainted_clusters,
trajectories, trajectories,
strokes, strokes,
start_points: starts, start_points: starts,
@@ -2115,6 +2197,63 @@ mod tests {
/// descent visits N × K candidates per pass (vs Kᴺ for the grid), /// descent visits N × K candidates per pass (vs Kᴺ for the grid),
/// and the parallel inner loop evaluates each candidate against /// and the parallel inner loop evaluates each candidate against
/// the whole corpus in one go. /// the whole corpus in one go.
/// Outer (meta) optimizer smoke test: searches ScoreWeights space and
/// ranks each candidate by the lexicographic comparator instead of a
/// hand-tuned weighted sum. The ordering is hard:
/// fewer letters with feature-sized unpainted clusters > fewer with
/// >5% bg > fewer single-stroke-letter constraint violations >
/// fewer two-stroke-letter constraint violations > fewer length
/// over-budget > then aggregate totals as tiebreakers.
///
/// Reads META_N_OUTER / META_N_INNER / META_PASSES env vars for sizing.
#[test]
#[ignore]
fn paint_meta_optimize() {
    // Only `run_meta_opt` is needed here. (The previous version also
    // imported `compare_reports` and silenced the unused-import warning
    // with a dummy `let _ = …`; both are gone.)
    use crate::brush_paint_opt::run_meta_opt;
    let base = PaintParams::default();
    // Smoke-test sizes — change for real runs. With these defaults
    // each meta-iteration is ~25-40s on an 8-core laptop; the full
    // 12×8×3 run takes ~40 min serial (use the SSH orchestrator if
    // you want it faster).
    let n_outer = std::env::var("META_N_OUTER").ok().and_then(|s| s.parse().ok()).unwrap_or(4);
    let n_inner_starts = std::env::var("META_N_INNER").ok().and_then(|s| s.parse().ok()).unwrap_or(4);
    let inner_passes = std::env::var("META_PASSES").ok().and_then(|s| s.parse().ok()).unwrap_or(2);
    println!("\n[meta] {} outer × {} inner starts × {} passes",
        n_outer, n_inner_starts, inner_passes);
    let t0 = std::time::Instant::now();
    let results = run_meta_opt(n_outer, n_inner_starts, inner_passes, &base);
    let elapsed = t0.elapsed();
    // Guard before indexing `results[0]` below — a zero-sized run
    // (META_N_OUTER=0) should fail with a message, not a raw panic.
    assert!(!results.is_empty(), "meta optimizer returned no results");
    println!("\n[meta] {} results in {:.1}s, lex-sorted best-first:",
        results.len(), elapsed.as_secs_f64());
    for (rank, r) in results.iter().enumerate() {
        println!(" #{:2} idx={:2} {}", rank+1, r.idx, r.report.summary());
    }
    let best = &results[0];
    println!("\n[meta] BEST WEIGHTS (idx={}):", best.idx);
    println!(" stroke = {:.0}", best.weights.stroke);
    println!(" length = {:.2}", best.weights.length);
    println!(" bg = {:.1}", best.weights.bg);
    println!(" repaint = {:.1}", best.weights.repaint);
    println!(" unpainted = {:.1}", best.weights.unpainted);
    println!(" unpainted_density= {:.2}", best.weights.unpainted_density);
    println!(" length_excess = {:.0}", best.weights.length_excess);
    println!(" curvature = {:.0}", best.weights.curvature);
    println!(" brush_size = {:.0}", best.weights.brush_size);
    println!("\n[meta] BEST PAINT PARAMS:");
    println!(" brush_radius_factor = {:.2}", best.params.brush_radius_factor);
    println!(" brush_radius_offset_px = {:.2}", best.params.brush_radius_offset_px);
    println!(" brush_radius_percentile = {:.2}", best.params.brush_radius_percentile);
    println!(" step_size_factor = {:.2}", best.params.step_size_factor);
    println!(" walk_bg_penalty = {:.2}", best.params.walk_bg_penalty);
    println!(" polish_iters = {}", best.params.polish_iters);
    println!(" polish_search_factor = {:.2}", best.params.polish_search_factor);
    println!(" bg_penalty = {:.2}", best.params.bg_penalty);
    println!(" pen_lift_penalty = {:.1}", best.params.pen_lift_penalty);
    println!(" min_component_factor = {:.2}", best.params.min_component_factor);
}
#[test] #[test]
#[ignore] #[ignore]
fn paint_optimize_global_defaults() { fn paint_optimize_global_defaults() {

View File

@@ -12,10 +12,13 @@
//! Both share `default_axes` / `build_corpus` / `build_start_params` / //! Both share `default_axes` / `build_corpus` / `build_start_params` /
//! `refine_one` so they search identical landscapes. //! `refine_one` so they search identical landscapes.
use std::cmp::Ordering;
use rayon::prelude::*; use rayon::prelude::*;
use serde::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
use crate::brush_paint::{ use crate::brush_paint::{
PaintParams, score_for_letter, metrics_for, rasterize_test_letter, PaintParams, PaintMetrics, ScoreWeights,
score_for_letter, metrics_for, rasterize_test_letter,
is_single_stroke_letter, is_two_stroke_letter,
}; };
use crate::hulls::Hull; use crate::hulls::Hull;
@@ -77,7 +80,10 @@ pub const CORPUS_CASES: &[(f32, u32, u32)] = &[
(5.0, 200, 4), (5.0, 200, 4),
(5.0, 425, 9), (5.0, 425, 9),
]; ];
pub const CORPUS_ALPHABET: &str = "ACGIJLMNOSUVWXZabcdefijlmosuvwxz"; // Includes all SINGLE_STROKE_LETTERS, all TWO_STROKE_LETTERS, plus the
// remaining alphanumerics for breadth. ~52 letters × N scales is the
// per-inner-eval cost.
pub const CORPUS_ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
pub fn build_corpus() -> Vec<(char, Hull)> { pub fn build_corpus() -> Vec<(char, Hull)> {
CORPUS_CASES.iter().flat_map(|&(mm, dpi, t)| { CORPUS_CASES.iter().flat_map(|&(mm, dpi, t)| {
@@ -236,3 +242,327 @@ pub fn run_one_start(start_idx: usize, base: &PaintParams, max_passes: u32) -> R
let (params, score, log) = refine_one(&corpus, &axes, &start, max_passes); let (params, score, log) = refine_one(&corpus, &axes, &start, max_passes);
RefineResult { start_idx, score, params, log } RefineResult { start_idx, score, params, log }
} }
// ─── Lexicographic outer-ranking ────────────────────────────────────────
//
// The outer optimizer (`run_meta_opt`) needs a way to rank "the result of
// running the inner optimizer with these ScoreWeights" without using
// another weighted sum. CorpusReport summarises one full corpus run and
// `compare_reports` is the lexicographic comparator that orders them.
//
// Tier 1 — count of letters violating each hard criterion:
//   1. max unpainted cluster > 0.5 × brush_area (a feature is missing)
//   2. bg_painted / total_swept > 5 %
//   3. SINGLE_STROKE_LETTERS with strokes ≠ 1
//   4. TWO_STROKE_LETTERS with strokes ≠ 2
//   5. total_length > 2 × skeleton_length
//
// Tier 2 — corpus aggregates (smaller is better, in this order):
//   6. total bg pixels
//   7. total stroke count
//   8. total density-weighted unpainted (Σ size^1.5)
//   9. total repaint
//   10. total length
/// Summary of one full corpus run under one parameter set; consumed by
/// `compare_reports` for hard lexicographic ranking. Built from one
/// `PaintMetrics` per letter by `CorpusReport::build`.
///
/// NOTE(review): field order is visible to serde derives — keep stable
/// if serialized reports are compared across versions.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CorpusReport {
    // Tier 1 — counts of failing letters (not pixel totals: each letter
    // contributes at most 1 per criterion).
    pub fail_coverage: u32, // max cluster > 0.5 × brush_area
    pub fail_bg: u32, // bg/swept > 5%
    pub fail_single_stroke: u32, // SINGLE_STROKE_LETTERS with strokes ≠ 1
    pub fail_two_stroke: u32, // TWO_STROKE_LETTERS with strokes ≠ 2
    pub fail_length_budget: u32, // total_length > 2 × skeleton_length
    // Tier 2 — corpus-wide totals (smaller is better).
    pub total_bg: u64,
    pub total_strokes: u64,
    pub total_unpainted_density: f64, // Σ cluster_size^1.5 over every letter's clusters
    pub total_repaint: u64,
    pub total_length: f64,
    // Bookkeeping (not used in comparator).
    pub n_letters: u32,
}
impl CorpusReport {
    /// Summarise one full corpus run (one `PaintMetrics` per letter)
    /// into tier-1 fail counts and tier-2 aggregates.
    pub fn build(letter_metrics: &[(char, PaintMetrics)]) -> Self {
        let mut report = CorpusReport {
            n_letters: letter_metrics.len() as u32,
            ..Default::default()
        };
        for (letter, metrics) in letter_metrics {
            // ── Tier 1: hard-criterion fail counts ──
            // A cluster above half the brush footprint means a
            // recognisable feature went unpainted.
            let threshold =
                0.5 * std::f32::consts::PI * metrics.brush_radius * metrics.brush_radius;
            let biggest = metrics.unpainted_clusters.iter().copied().max().unwrap_or(0);
            if biggest as f32 > threshold {
                report.fail_coverage += 1;
            }
            if metrics.total_swept > 0
                && metrics.bg_painted as f32 / metrics.total_swept as f32 > 0.05
            {
                report.fail_bg += 1;
            }
            if is_single_stroke_letter(*letter) && metrics.strokes != 1 {
                report.fail_single_stroke += 1;
            }
            if is_two_stroke_letter(*letter) && metrics.strokes != 2 {
                report.fail_two_stroke += 1;
            }
            if metrics.skeleton_length > 0
                && metrics.total_length > 2.0 * metrics.skeleton_length as f32
            {
                report.fail_length_budget += 1;
            }
            // ── Tier 2: corpus-wide aggregates ──
            report.total_bg += metrics.bg_painted as u64;
            report.total_strokes += metrics.strokes as u64;
            report.total_repaint += metrics.repaint as u64;
            report.total_length += metrics.total_length as f64;
            for &size in &metrics.unpainted_clusters {
                report.total_unpainted_density += (size as f64).powf(1.5);
            }
        }
        report
    }
    /// Sum of all Tier 1 fail counts. Useful as a "feasibility score".
    pub fn tier1_total(&self) -> u32 {
        self.fail_coverage
            + self.fail_bg
            + self.fail_single_stroke
            + self.fail_two_stroke
            + self.fail_length_budget
    }
    /// One-line digest: tier-1 fail counts, then tier-2 totals.
    pub fn summary(&self) -> String {
        format!(
            "T1[cov={} bg={} 1stk={} 2stk={} len={}] T2[bg={} stk={} dens={:.0} rep={} len={:.0}]",
            self.fail_coverage, self.fail_bg, self.fail_single_stroke,
            self.fail_two_stroke, self.fail_length_budget,
            self.total_bg, self.total_strokes, self.total_unpainted_density,
            self.total_repaint, self.total_length,
        )
    }
}
/// Lexicographic compare. Returns `Less` if `a` is BETTER than `b`
/// (sorts in ascending order = best first).
pub fn compare_reports(a: &CorpusReport, b: &CorpusReport) -> Ordering {
    // Float fields could in principle be NaN; treat incomparable pairs
    // as equal so the sort stays total.
    let flt = |x: f64, y: f64| x.partial_cmp(&y).unwrap_or(Ordering::Equal);
    // Tier 1: count of letters failing each hard criterion.
    a.fail_coverage
        .cmp(&b.fail_coverage)
        .then_with(|| a.fail_bg.cmp(&b.fail_bg))
        .then_with(|| a.fail_single_stroke.cmp(&b.fail_single_stroke))
        .then_with(|| a.fail_two_stroke.cmp(&b.fail_two_stroke))
        .then_with(|| a.fail_length_budget.cmp(&b.fail_length_budget))
        // Tier 2: corpus-wide aggregates, smaller is better.
        .then_with(|| a.total_bg.cmp(&b.total_bg))
        .then_with(|| a.total_strokes.cmp(&b.total_strokes))
        .then_with(|| flt(a.total_unpainted_density, b.total_unpainted_density))
        .then_with(|| a.total_repaint.cmp(&b.total_repaint))
        .then_with(|| flt(a.total_length, b.total_length))
}
/// Run the inner optimizer (multi-start coordinate-descent refinement)
/// under the given ScoreWeights, evaluate the resulting params on the
/// corpus, and return both the best params and a CorpusReport. Used by
/// the meta-optimizer's outer evaluation.
///
/// * `weights` — soft-term weights for the inner score function.
/// * `corpus` — (letter, hull) pairs to evaluate every candidate on.
/// * `axes` — the searchable PaintParams dimensions.
/// * `base` — starting point that `build_start_params` perturbs.
/// * `n_starts` — number of independent refinement starts (parallel).
/// * `max_passes` — coordinate-descent passes per start.
pub fn evaluate_score_weights(
    weights: &ScoreWeights,
    corpus: &[(char, Hull)],
    axes: &[Axis],
    base: &PaintParams,
    n_starts: usize,
    max_passes: u32,
) -> (PaintParams, CorpusReport) {
    // Whole-corpus score under the supplied weights. The per-letter
    // function already includes the hard constraint barriers, so no
    // extra terms are added here (a dead `s += 0.0` placeholder was
    // removed).
    let inner_score = |p: &PaintParams| -> f32 {
        corpus.par_iter().map(|(ch, hull)| {
            let (_, m) = metrics_for(hull, p);
            score_for_letter_with_weights(*ch, &m, weights)
        }).sum()
    };
    // Score one candidate value on one axis (rounded/clamped per the
    // axis) without mutating the caller's params.
    let try_axis = |params: &PaintParams, axis: &Axis, v: f32| -> f32 {
        let mut p = params.clone();
        let v = if axis.is_int { v.round().clamp(axis.lo, axis.hi) } else { v.clamp(axis.lo, axis.hi) };
        (axis.set)(&mut p, v);
        inner_score(&p)
    };
    // Golden-section line search along one axis; returns (value, score).
    let golden = |params: &PaintParams, axis: &Axis, iters: u32| -> (f32, f32) {
        const PHI: f32 = 0.6180339887;
        let (mut a, mut b) = (axis.lo, axis.hi);
        let mut x1 = b - PHI * (b - a); let mut x2 = a + PHI * (b - a);
        let mut f1 = try_axis(params, axis, x1); let mut f2 = try_axis(params, axis, x2);
        for _ in 0..iters {
            if f1 < f2 { b = x2; x2 = x1; f2 = f1; x1 = b - PHI * (b - a); f1 = try_axis(params, axis, x1); }
            else { a = x1; x1 = x2; f1 = f2; x2 = a + PHI * (b - a); f2 = try_axis(params, axis, x2); }
            // Integer axes can't be split below one unit.
            if axis.is_int && (b - a) < 1.0 { break; }
        }
        if f1 < f2 { (if axis.is_int { x1.round() } else { x1 }, f1) }
        else { (if axis.is_int { x2.round() } else { x2 }, f2) }
    };
    // Coordinate descent: each pass line-searches every axis from the
    // current point (axes in parallel) and commits only the single best
    // axis move; stops when no axis improves by more than 1.0.
    let refine = |start: &PaintParams| -> (PaintParams, f32) {
        let mut current = start.clone();
        let mut current_score = inner_score(&current);
        for _ in 0..max_passes {
            let per_axis: Vec<(usize, f32, f32)> = axes.par_iter().enumerate().map(|(ai, ax)| {
                let (v, s) = golden(&current, ax, 8); // fewer iters than full inner — meta is outer of outer
                (ai, v, s)
            }).collect();
            let (best_ai, best_v, best_s) = per_axis.iter()
                .min_by(|a, b| a.2.partial_cmp(&b.2).unwrap()).cloned().unwrap();
            if best_s + 1.0 >= current_score { break; }
            (axes[best_ai].set)(&mut current, best_v);
            current_score = best_s;
        }
        (current, current_score)
    };
    // Run all starts in parallel, printing a progress counter as each
    // start completes so long meta-optimization runs show activity.
    let starts: Vec<PaintParams> = (0..n_starts)
        .map(|i| build_start_params(i, base, axes))
        .collect();
    let counter = std::sync::atomic::AtomicUsize::new(0);
    let results: Vec<(PaintParams, f32)> = starts.par_iter().map(|s| {
        let r = refine(s);
        let n = counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1;
        eprint!("\r[inner] refined {n}/{n_starts}");
        if n == n_starts { eprint!("\n"); }
        r
    }).collect();
    let (best_params, _) = results.into_iter()
        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()).unwrap();
    // Build the lexicographic CorpusReport on the BEST params only.
    let letter_metrics: Vec<(char, PaintMetrics)> = corpus.iter()
        .map(|(ch, hull)| {
            let (_, m) = metrics_for(hull, &best_params);
            (*ch, m)
        }).collect();
    let report = CorpusReport::build(&letter_metrics);
    (best_params, report)
}
/// Same shape as score_for_letter but takes ScoreWeights so the meta-
/// optimizer can vary them. Mirrors the original `score_for_letter`
/// barriers exactly; the only thing the meta optimizer changes is the
/// soft-term weights (ScoreWeights), not the hard barriers.
fn score_for_letter_with_weights(ch: char, m: &PaintMetrics, w: &ScoreWeights) -> f32 {
    use crate::brush_paint::score_weighted;
    // Soft terms: the weighted sum the meta-optimizer varies.
    let mut score = score_weighted(m, *w);
    // Barrier 1: background paint above 5% of swept pixels.
    if m.total_swept > 0 {
        let bg_rate = m.bg_painted as f32 / m.total_swept as f32;
        if bg_rate > 0.05 {
            score += 100_000_000.0 * (bg_rate - 0.05);
        }
    }
    // Barrier 2: a single unpainted cluster bigger than half the brush
    // footprint (a feature-sized miss).
    let threshold = 0.5 * std::f32::consts::PI * m.brush_radius * m.brush_radius;
    let worst_cluster = m.unpainted_clusters.iter().copied().max().unwrap_or(0) as f32;
    if worst_cluster > threshold {
        score += 1_000_000.0 * (worst_cluster - threshold);
    }
    // Barrier 3: total path length beyond twice the skeleton length.
    if m.skeleton_length > 0 {
        let budget = 2.0 * m.skeleton_length as f32;
        if m.total_length > budget {
            score += 100_000.0 * (m.total_length - budget);
        }
    }
    // Constraint penalties: no strokes at all, then stroke-count
    // mismatches for letters with a known natural topology.
    if m.strokes == 0 {
        score += 200_000.0;
    }
    if is_single_stroke_letter(ch) && m.strokes != 1 {
        score += 50_000.0 * (m.strokes as i64 - 1).abs() as f32;
    }
    if is_two_stroke_letter(ch) && m.strokes != 2 {
        score += 50_000.0 * (m.strokes as i64 - 2).abs() as f32;
    }
    score
}
// ─── Meta-optimizer (outer search over ScoreWeights) ────────────────────
/// One outer-sample's outcome: the candidate weights, the best inner
/// params they produced, and the lexicographic report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetaResult {
    // Outer-sample index; also the seed fed to `build_meta_weights`.
    pub idx: usize,
    // The ScoreWeights candidate this sample evaluated.
    pub weights: ScoreWeights,
    // Best PaintParams the inner optimizer found under `weights`.
    pub params: PaintParams,
    // Corpus-wide summary used by `compare_reports` for ranking.
    pub report: CorpusReport,
}
/// Sample one ScoreWeights from a deterministic per-index PRNG. The
/// ranges are picked to roughly bracket the existing defaults at ½×–4×.
/// Index 0 always returns the defaults so the baseline is in every run.
pub fn build_meta_weights(idx: usize) -> ScoreWeights {
    if idx == 0 { return ScoreWeights::default(); }
    // SplitMix-style seed scramble so nearby indices diverge.
    let mut state = (idx as u64)
        .wrapping_mul(0xDA942042E4DD58B5)
        .wrapping_add(0xCAFEBABE);
    // LCG step, then the top 24 bits as a uniform f32 in [0, 1).
    // (Top bits are the well-mixed ones in an LCG, and 24 bits fit an
    // f32 mantissa exactly. BUGFIX: the previous version shifted right
    // by 33 — leaving a 31-bit value — but still divided by u32::MAX,
    // so every draw landed in [0, 0.5) and the sampled weights only
    // ever covered the lower half of the documented ranges.)
    let next = |state: &mut u64| -> f32 {
        *state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
        ((*state >> 40) as u32 as f32) / ((1u32 << 24) as f32)
    };
    let unif = |state: &mut u64, lo: f32, hi: f32| lo + next(state) * (hi - lo);
    // NOTE: the field order below fixes the PRNG draw order — keep it
    // stable or previously-logged idx→weights mappings change.
    ScoreWeights {
        stroke: unif(&mut state, 100.0, 2000.0),
        length: unif(&mut state, 0.5, 20.0),
        bg: unif(&mut state, 10.0, 200.0),
        repaint: unif(&mut state, 5.0, 100.0),
        unpainted: unif(&mut state, 5.0, 300.0),
        unpainted_density: unif(&mut state, 1.0, 50.0),
        length_excess: unif(&mut state, 50.0, 1500.0),
        curvature: unif(&mut state, 50.0, 2000.0),
        brush_size: unif(&mut state, 0.0, 8000.0),
    }
}
/// Run the meta-optimizer: try `n_outer` random ScoreWeights, run the
/// inner optimizer for each, rank lexicographically, return ALL results
/// sorted best-first.
///
/// Progress: prints one line to stderr per outer sample as it
/// completes — sample idx, elapsed, this-result's tier-1/tier-2
/// summary, and whether it's the best yet (★ = improved).
pub fn run_meta_opt(
    n_outer: usize,
    n_inner_starts: usize,
    inner_passes: u32,
    base: &PaintParams,
) -> Vec<MetaResult> {
    let axes = default_axes();
    let corpus = build_corpus();
    let t_start = std::time::Instant::now();
    eprintln!("[meta] starting {} outer × {} inner × {} passes",
        n_outer, n_inner_starts, inner_passes);
    let mut results: Vec<MetaResult> = Vec::with_capacity(n_outer);
    // Lexicographic best-so-far, tracked only for the progress marker —
    // the final ranking is a full sort at the end.
    let mut best_so_far_report: Option<CorpusReport> = None;
    for idx in 0..n_outer {
        let t0 = std::time::Instant::now();
        let weights = build_meta_weights(idx);
        let (params, report) = evaluate_score_weights(
            &weights, &corpus, &axes, base, n_inner_starts, inner_passes
        );
        let dt = t0.elapsed().as_secs_f64();
        let total = t_start.elapsed().as_secs_f64();
        // Naive linear ETA from the average per-sample time so far.
        let est_remaining = (total / (idx as f64 + 1.0)) * (n_outer as f64 - idx as f64 - 1.0);
        let is_new_best = match &best_so_far_report {
            None => true,
            Some(b) => compare_reports(&report, b) == std::cmp::Ordering::Less,
        };
        // BUGFIX: the marker for an improved sample was the empty string,
        // so the documented "★ = improved" never printed and the columns
        // misaligned against the single-space non-improvement marker.
        let marker = if is_new_best { "★" } else { " " };
        eprintln!(
            "[meta] {}{:3}/{} {:6.1}s {} (total {:.0}s, eta {:.0}s)",
            marker, idx + 1, n_outer, dt, report.summary(), total, est_remaining,
        );
        if is_new_best {
            best_so_far_report = Some(report.clone());
        }
        results.push(MetaResult { idx, weights, params, report });
    }
    eprintln!("[meta] done, lex-sorting {} results", results.len());
    results.sort_by(|a, b| compare_reports(&a.report, &b.report));
    results
}