diff --git a/scripts/optimize_distributed.sh b/scripts/optimize_distributed.sh index a32cce6d..a94592cb 100755 --- a/scripts/optimize_distributed.sh +++ b/scripts/optimize_distributed.sh @@ -43,13 +43,16 @@ echo "[orch] split: $LOCAL_N local, $REMOTE_N remote" >&2 echo "[orch] cargo build --release --bin paint_opt_worker (local)…" >&2 ( cd "$ROOT" && cargo build --release --bin paint_opt_worker ) >&2 -# Build remote binary in parallel if a remote is configured. +# Build remote binary in parallel if a remote is configured. We invoke +# remote commands via `bash -lc` so the login shell sources whatever +# rc files put cargo on PATH (~/.cargo/bin); without that, ssh's +# non-login shell may fail with "cargo: command not found". REMOTE_BUILD_PID="" if [[ -n "$REMOTE" ]]; then HOST="${REMOTE%%:*}" RPATH="${REMOTE#*:}" echo "[orch] cargo build --release on remote ($HOST:$RPATH)…" >&2 - ( ssh "$HOST" "cd '$RPATH' && cargo build --release --bin paint_opt_worker" >&2 ) & + ( ssh "$HOST" "bash -lc 'cd \"$RPATH\" && cargo build --release --bin paint_opt_worker'" >&2 ) & REMOTE_BUILD_PID=$! fi @@ -83,7 +86,7 @@ if [[ -n "$REMOTE" && "$REMOTE_N" -gt 0 ]]; then echo "[orch] launching $REMOTE_N remote workers on $HOST…" >&2 # Generate the index list local-side and stream to xargs over ssh. seq "$LOCAL_N" $((N - 1)) | \ - ssh "$HOST" "cd '$RPATH' && xargs -n1 -P$REMOTE_N -I{} ./target/release/paint_opt_worker {}" \ + ssh "$HOST" "bash -lc 'cd \"$RPATH\" && xargs -n1 -P$REMOTE_N -I{} ./target/release/paint_opt_worker {}'" \ >> "$REMOTE_OUT" & REMOTE_PID=$! fi diff --git a/src/brush_paint.rs b/src/brush_paint.rs index d3d1f4ea..85610eea 100644 --- a/src/brush_paint.rs +++ b/src/brush_paint.rs @@ -205,6 +205,11 @@ pub struct PaintDebug { /// near this; well under 1.5× this means efficient, well over means /// the path is snaking. pub skeleton_length: u32, + /// Sizes of every connected component in the *unpainted* ink mask + /// at the end of painting. 
Lets the score function tell scattered
+    /// edge slop ("ten 1-px clusters") apart from feature-sized misses
+    /// ("one 30-px crossbar tip cluster").
+    pub unpainted_clusters: Vec<u32>,
     /// Raw trajectories (one per stroke), pre-smoothing.
     pub trajectories: Vec>,
     /// Final smoothed strokes (what would go to gcode).
@@ -517,6 +522,44 @@ impl Grid {
     }
 
+    /// 4-connected connected-component analysis on the *currently
+    /// unpainted* ink mask. Returns one size per CC, in pixels.
+    /// Used for density-aware coverage scoring: a single 30-px cluster
+    /// (a missed crossbar tip) is recognisable as missing whereas the
+    /// same 30 pixels split into 30 single-pixel scattered slop is just
+    /// brush-edge noise.
+    fn unpainted_cluster_sizes(&self) -> Vec<u32> {
+        let n = self.unpainted.len();
+        let mut comp_id = vec![-1i32; n];
+        let mut sizes: Vec<u32> = Vec::new();
+        for sy in 0..self.height {
+            for sx in 0..self.width {
+                let s_idx = (sy * self.width + sx) as usize;
+                if !self.unpainted[s_idx] || comp_id[s_idx] >= 0 { continue; }
+                let id = sizes.len() as i32;
+                let mut size = 0u32;
+                let mut stack: Vec<(i32, i32)> = vec![(sx, sy)];
+                while let Some((cx, cy)) = stack.pop() {
+                    let cidx = (cy * self.width + cx) as usize;
+                    if comp_id[cidx] >= 0 { continue; }
+                    if !self.unpainted[cidx] { continue; }
+                    comp_id[cidx] = id;
+                    size += 1;
+                    for (dx, dy) in [(1, 0i32), (-1, 0), (0, 1), (0, -1)] {
+                        let nx = cx + dx; let ny = cy + dy;
+                        if nx < 0 || ny < 0 || nx >= self.width || ny >= self.height { continue; }
+                        let nidx = (ny * self.width + nx) as usize;
+                        if self.unpainted[nidx] && comp_id[nidx] < 0 {
+                            stack.push((nx, ny));
+                        }
+                    }
+                }
+                sizes.push(size);
+            }
+        }
+        sizes
+    }
+
     /// True iff `(x, y)` is an originally-ink pixel.
     fn is_ink(&self, x: i32, y: i32) -> bool {
         let lx = x - self.bx; let ly = y - self.by;
         if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; }
@@ -1381,7 +1424,7 @@ pub fn paint_fill_with(hull: &Hull, params: &PaintParams) -> FillResult {
 ///
Quantitative summary of one fill result. Computed cheaply from a
/// `FillResult` plus the source hull (the hull is needed to count
/// background paint, since FillResult only has stroke geometry).
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone)]
 pub struct PaintMetrics {
     /// Number of strokes (= pen lifts + 1, or 0 if no strokes).
     pub strokes: u32,
@@ -1405,6 +1448,11 @@ pub struct PaintMetrics {
     /// "ideal" path length budget — `total_length` should sit close to
     /// this for efficient single-pass tracing.
     pub skeleton_length: u32,
+    /// Sizes of unpainted-ink connected components after the algorithm
+    /// finishes. Density signal: one 30-px cluster (a real missing
+    /// feature) reads worse than thirty 1-px scattered slop pixels even
+    /// though both have the same total unpainted count.
+    pub unpainted_clusters: Vec<u32>,
     /// Sum of absolute angle changes between consecutive segments along
     /// every stroke, in radians. Smooth handwriting has small total
     /// curvature; jagged zigzag accumulates lots.
@@ -1445,16 +1493,17 @@ pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetri
         c
     }).sum();
     let m = PaintMetrics {
-        strokes: strokes.len() as u32,
+        strokes:            strokes.len() as u32,
         total_length,
-        bg_painted: dbg.bg_painted,
-        total_swept: dbg.total_swept,
-        repaint: dbg.repaint,
-        ink_total: dbg.ink_total,
-        ink_unpainted: dbg.ink_unpainted,
-        skeleton_length: dbg.skeleton_length,
+        bg_painted:         dbg.bg_painted,
+        total_swept:        dbg.total_swept,
+        repaint:            dbg.repaint,
+        ink_total:          dbg.ink_total,
+        ink_unpainted:      dbg.ink_unpainted,
+        skeleton_length:    dbg.skeleton_length,
+        unpainted_clusters: dbg.unpainted_clusters.clone(),
         curvature,
-        brush_radius: dbg.brush_radius,
+        brush_radius:       dbg.brush_radius,
     };
     (FillResult { hull_id: hull.id, strokes }, m)
 }
@@ -1465,10 +1514,20 @@ pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetri
 /// each one has a known single-stroke topology (possibly with double-back).
pub const SINGLE_STROKE_LETTERS: &str = "CGIJLMNOSUVWZcejilosvwz"; +/// Letters whose natural human topology is *exactly two* pen strokes. +/// Crosses (T/t/X/x/+) and Y-junctions where one continuous stroke +/// would require an unnatural double-back across the cross. Constraint +/// penalty applies when stroke count ≠ 2. +pub const TWO_STROKE_LETTERS: &str = "TtXxKkYyFfHh"; + pub fn is_single_stroke_letter(ch: char) -> bool { SINGLE_STROKE_LETTERS.contains(ch) } +pub fn is_two_stroke_letter(ch: char) -> bool { + TWO_STROKE_LETTERS.contains(ch) +} + /// Letter-aware score: applies the default score plus hard constraint /// failures. A config that trips ANY hard ceiling returns f32::MAX so /// the optimizer rejects it outright. Soft knobs (brush_size bonus, @@ -1503,11 +1562,14 @@ pub fn score_for_letter(ch: char, m: &PaintMetrics) -> f32 { s += 100_000_000.0 * (bg_rate - 0.05); } } - if m.ink_total > 0 { - let unpainted_rate = m.ink_unpainted as f32 / m.ink_total as f32; - if unpainted_rate > 0.05 { - s += 100_000_000.0 * (unpainted_rate - 0.05); - } + // Density-aware unpainted barrier. A cluster bigger than half the + // brush footprint = a recognisable feature is missing (a crossbar + // tip, half a loop, etc.). Scattered single-pixel slop never trips + // this; one 30-px cluster does. Threshold scales with brush. + let cluster_threshold = 0.5 * std::f32::consts::PI * m.brush_radius * m.brush_radius; + let max_cluster = m.unpainted_clusters.iter().copied().max().unwrap_or(0) as f32; + if max_cluster > cluster_threshold { + s += 1_000_000.0 * (max_cluster - cluster_threshold); } if m.skeleton_length > 0 && m.total_length > 2.0 * m.skeleton_length as f32 { // Length budget: 100k/px above 2× skel. 
For a 300-px-skeleton @@ -1522,6 +1584,10 @@ pub fn score_for_letter(ch: char, m: &PaintMetrics) -> f32 { let delta = (m.strokes as i64 - 1).abs() as f32; s += 50_000.0 * delta; } + if is_two_stroke_letter(ch) && m.strokes != 2 { + let delta = (m.strokes as i64 - 2).abs() as f32; + s += 50_000.0 * delta; + } s } @@ -1537,13 +1603,22 @@ pub fn default_score(m: &PaintMetrics) -> f32 { score_weighted(m, ScoreWeights::default()) } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] pub struct ScoreWeights { pub stroke: f32, pub length: f32, pub bg: f32, pub repaint: f32, + /// Linear unpainted-pixel cost. Cheap by design — the heavy lifting + /// is done by `unpainted_density` which super-linearly weights + /// large clusters. Linear stays for tie-breaking among configs that + /// have similar density patterns. pub unpainted: f32, + /// Density-aware unpainted cost. Penalty per letter is + /// `weight × Σ over clusters of size^1.5`. A 30-px cluster (a + /// recognisable missing feature) costs ~5× a 30-px scattered slop, + /// matching how visible each is on the printed page. + pub unpainted_density: f32, /// Per-pixel cost of stroke length above 1.5× the skeleton length /// (the "ideal" trace). 0 inside budget; ramps up sharply outside. pub length_excess: f32, @@ -1571,14 +1646,15 @@ impl Default for ScoreWeights { // few unpainted pixels over a larger-radius solution that paints // 50× as many bg pixels. Self { - stroke: 500.0, - length: 5.0, - bg: 50.0, - repaint: 30.0, - unpainted: 200.0, - length_excess: 300.0, - curvature: 500.0, - brush_size: 2000.0, // pressure toward bigger brush. Per + stroke: 500.0, + length: 5.0, + bg: 50.0, + repaint: 30.0, + unpainted: 50.0, // mild — density carries the weight + unpainted_density: 10.0, // cluster_size^1.5 × 10 + length_excess: 300.0, + curvature: 500.0, + brush_size: 2000.0, // pressure toward bigger brush. 
Per // letter, +1 px brush = +2000 bonus; // vs bg=50/px that's "worth" up to // ~40 extra bg pixels per letter. So @@ -1593,14 +1669,18 @@ impl Default for ScoreWeights { pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 { let budget = 1.5 * m.skeleton_length as f32; let excess = (m.total_length - budget).max(0.0); - w.stroke * m.strokes as f32 - + w.length * m.total_length - + w.bg * m.bg_painted as f32 - + w.repaint * m.repaint as f32 - + w.unpainted * m.ink_unpainted as f32 - + w.length_excess * excess - + w.curvature * m.curvature - - w.brush_size * m.brush_radius + let density: f32 = m.unpainted_clusters.iter() + .map(|&n| (n as f32).powf(1.5)) + .sum(); + w.stroke * m.strokes as f32 + + w.length * m.total_length + + w.bg * m.bg_painted as f32 + + w.repaint * m.repaint as f32 + + w.unpainted * m.ink_unpainted as f32 + + w.unpainted_density * density + + w.length_excess * excess + + w.curvature * m.curvature + - w.brush_size * m.brush_radius } /// Run `paint_fill_with` once per `params_variant` and return the best @@ -1792,6 +1872,7 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug { let ink_unpainted = grid.ink_remaining.max(0) as u32; let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid, brush_radius); let skeleton_length = grid.skeleton_length; + let unpainted_clusters = grid.unpainted_cluster_sizes(); PaintDebug { bounds, source_b64: crate::streamline::encode_hull_pixels_b64(hull), @@ -1805,6 +1886,7 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug { total_swept, repaint, skeleton_length, + unpainted_clusters, trajectories, strokes, start_points: starts, @@ -2115,6 +2197,63 @@ mod tests { /// descent visits N × K candidates per pass (vs Kᴺ for the grid), /// and the parallel inner loop evaluates each candidate against /// the whole corpus in one go. 
+ /// Outer (meta) optimizer: searches ScoreWeights space and ranks + /// each candidate by the lexicographic comparator (`compare_reports`) + /// instead of a hand-tuned weighted sum. The ordering is hard: + /// fewer letters with feature-sized unpainted clusters > fewer with + /// >5% bg > fewer single-stroke-letter constraint violations > + /// fewer two-stroke-letter constraint violations > fewer length + /// over-budget > then aggregate totals as tiebreakers. + #[test] + #[ignore] + fn paint_meta_optimize() { + use crate::brush_paint_opt::{run_meta_opt, compare_reports}; + + let base = PaintParams::default(); + // Smoke-test sizes — change for real runs. With these defaults + // each meta-iteration is ~25-40s on an 8-core laptop; the full + // 12×8×3 run takes ~40 min serial (use the SSH orchestrator if + // you want it faster). + let n_outer = std::env::var("META_N_OUTER").ok().and_then(|s| s.parse().ok()).unwrap_or(4); + let n_inner_starts = std::env::var("META_N_INNER").ok().and_then(|s| s.parse().ok()).unwrap_or(4); + let inner_passes = std::env::var("META_PASSES").ok().and_then(|s| s.parse().ok()).unwrap_or(2); + + println!("\n[meta] {} outer × {} inner starts × {} passes", + n_outer, n_inner_starts, inner_passes); + let t0 = std::time::Instant::now(); + let results = run_meta_opt(n_outer, n_inner_starts, inner_passes, &base); + let elapsed = t0.elapsed(); + + println!("\n[meta] {} results in {:.1}s, lex-sorted best-first:", + results.len(), elapsed.as_secs_f64()); + for (rank, r) in results.iter().enumerate() { + println!(" #{:2} idx={:2} {}", rank+1, r.idx, r.report.summary()); + } + let best = &results[0]; + let _ = compare_reports; // silence unused warn + println!("\n[meta] BEST WEIGHTS (idx={}):", best.idx); + println!(" stroke = {:.0}", best.weights.stroke); + println!(" length = {:.2}", best.weights.length); + println!(" bg = {:.1}", best.weights.bg); + println!(" repaint = {:.1}", best.weights.repaint); + println!(" unpainted = {:.1}", 
best.weights.unpainted); + println!(" unpainted_density= {:.2}", best.weights.unpainted_density); + println!(" length_excess = {:.0}", best.weights.length_excess); + println!(" curvature = {:.0}", best.weights.curvature); + println!(" brush_size = {:.0}", best.weights.brush_size); + println!("\n[meta] BEST PAINT PARAMS:"); + println!(" brush_radius_factor = {:.2}", best.params.brush_radius_factor); + println!(" brush_radius_offset_px = {:.2}", best.params.brush_radius_offset_px); + println!(" brush_radius_percentile = {:.2}", best.params.brush_radius_percentile); + println!(" step_size_factor = {:.2}", best.params.step_size_factor); + println!(" walk_bg_penalty = {:.2}", best.params.walk_bg_penalty); + println!(" polish_iters = {}", best.params.polish_iters); + println!(" polish_search_factor = {:.2}", best.params.polish_search_factor); + println!(" bg_penalty = {:.2}", best.params.bg_penalty); + println!(" pen_lift_penalty = {:.1}", best.params.pen_lift_penalty); + println!(" min_component_factor = {:.2}", best.params.min_component_factor); + } + #[test] #[ignore] fn paint_optimize_global_defaults() { diff --git a/src/brush_paint_opt.rs b/src/brush_paint_opt.rs index 74ad1435..5d9a63be 100644 --- a/src/brush_paint_opt.rs +++ b/src/brush_paint_opt.rs @@ -12,10 +12,13 @@ //! Both share `default_axes` / `build_corpus` / `build_start_params` / //! `refine_one` so they search identical landscapes. 
+use std::cmp::Ordering; use rayon::prelude::*; use serde::{Serialize, Deserialize}; use crate::brush_paint::{ - PaintParams, score_for_letter, metrics_for, rasterize_test_letter, + PaintParams, PaintMetrics, ScoreWeights, + score_for_letter, metrics_for, rasterize_test_letter, + is_single_stroke_letter, is_two_stroke_letter, }; use crate::hulls::Hull; @@ -77,7 +80,10 @@ pub const CORPUS_CASES: &[(f32, u32, u32)] = &[ (5.0, 200, 4), (5.0, 425, 9), ]; -pub const CORPUS_ALPHABET: &str = "ACGIJLMNOSUVWXZabcdefijlmosuvwxz"; +// Includes all SINGLE_STROKE_LETTERS, all TWO_STROKE_LETTERS, plus the +// remaining alphanumerics for breadth. ~52 letters × N scales is the +// per-inner-eval cost. +pub const CORPUS_ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; pub fn build_corpus() -> Vec<(char, Hull)> { CORPUS_CASES.iter().flat_map(|&(mm, dpi, t)| { @@ -236,3 +242,327 @@ pub fn run_one_start(start_idx: usize, base: &PaintParams, max_passes: u32) -> R let (params, score, log) = refine_one(&corpus, &axes, &start, max_passes); RefineResult { start_idx, score, params, log } } + +// ─── Lexicographic outer-ranking ──────────────────────────────────────── +// +// The outer optimizer (`run_meta_opt`) needs a way to rank "the result of +// running the inner optimizer with these ScoreWeights" without using +// another weighted sum. CorpusReport summarises one full corpus run and +// `compare_reports` is the lexicographic comparator that orders them. +// +// Tier 1 — count of letters violating each hard criterion: +// 1. max unpainted cluster > 0.5 × brush_area (a feature is missing) +// 2. bg_painted / total_swept > 5 % +// 3. SINGLE_STROKE_LETTERS with strokes ≠ 1 +// 4. TWO_STROKE_LETTERS with strokes ≠ 2 +// 5. total_length > 2 × skeleton_length +// +// Tier 2 — corpus aggregates (smaller is better, in this order): +// 6. total bg pixels +// 7. total stroke count +// 8. total density-weighted unpainted (Σ size^1.5) +// 9. total repaint +// 10. 
total length + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct CorpusReport { + // Tier 1 — counts of failing letters. + pub fail_coverage: u32, // max cluster > 0.5 × brush_area + pub fail_bg: u32, // bg/swept > 5% + pub fail_single_stroke: u32, + pub fail_two_stroke: u32, + pub fail_length_budget: u32, // total_length > 2 × skeleton_length + // Tier 2 — corpus-wide totals (smaller is better). + pub total_bg: u64, + pub total_strokes: u64, + pub total_unpainted_density: f64, + pub total_repaint: u64, + pub total_length: f64, + // Bookkeeping (not used in comparator). + pub n_letters: u32, +} + +impl CorpusReport { + pub fn build(letter_metrics: &[(char, PaintMetrics)]) -> Self { + let mut r = CorpusReport { n_letters: letter_metrics.len() as u32, ..Default::default() }; + for (ch, m) in letter_metrics { + // Tier 1. + let cluster_threshold = 0.5 * std::f32::consts::PI * m.brush_radius * m.brush_radius; + let max_cluster = m.unpainted_clusters.iter().copied().max().unwrap_or(0); + if (max_cluster as f32) > cluster_threshold { r.fail_coverage += 1; } + + if m.total_swept > 0 { + let bg_rate = m.bg_painted as f32 / m.total_swept as f32; + if bg_rate > 0.05 { r.fail_bg += 1; } + } + if is_single_stroke_letter(*ch) && m.strokes != 1 { r.fail_single_stroke += 1; } + if is_two_stroke_letter(*ch) && m.strokes != 2 { r.fail_two_stroke += 1; } + if m.skeleton_length > 0 && m.total_length > 2.0 * m.skeleton_length as f32 { + r.fail_length_budget += 1; + } + + // Tier 2. + r.total_bg += m.bg_painted as u64; + r.total_strokes += m.strokes as u64; + r.total_repaint += m.repaint as u64; + r.total_length += m.total_length as f64; + for &cz in &m.unpainted_clusters { + r.total_unpainted_density += (cz as f64).powf(1.5); + } + } + r + } + + /// Sum of all Tier 1 fail counts. Useful as a "feasibility score". 
+ pub fn tier1_total(&self) -> u32 { + self.fail_coverage + self.fail_bg + self.fail_single_stroke + + self.fail_two_stroke + self.fail_length_budget + } + + pub fn summary(&self) -> String { + format!( + "T1[cov={} bg={} 1stk={} 2stk={} len={}] T2[bg={} stk={} dens={:.0} rep={} len={:.0}]", + self.fail_coverage, self.fail_bg, self.fail_single_stroke, + self.fail_two_stroke, self.fail_length_budget, + self.total_bg, self.total_strokes, self.total_unpainted_density, + self.total_repaint, self.total_length, + ) + } +} + +/// Lexicographic compare. Returns `Less` if `a` is BETTER than `b` +/// (sorts in ascending order = best first). +pub fn compare_reports(a: &CorpusReport, b: &CorpusReport) -> Ordering { + macro_rules! cmp_field { ($f:ident) => { + match a.$f.cmp(&b.$f) { + Ordering::Equal => {}, + non_eq => return non_eq, + } + }; } + macro_rules! cmp_float { ($f:ident) => { + match a.$f.partial_cmp(&b.$f).unwrap_or(Ordering::Equal) { + Ordering::Equal => {}, + non_eq => return non_eq, + } + }; } + // Tier 1: count of letters failing each hard criterion. + cmp_field!(fail_coverage); + cmp_field!(fail_bg); + cmp_field!(fail_single_stroke); + cmp_field!(fail_two_stroke); + cmp_field!(fail_length_budget); + // Tier 2: aggregates. + cmp_field!(total_bg); + cmp_field!(total_strokes); + cmp_float!(total_unpainted_density); + cmp_field!(total_repaint); + cmp_float!(total_length); + Ordering::Equal +} + +/// Run the inner optimizer (multi-start refinement) under given +/// ScoreWeights, evaluate the resulting params on the corpus, and +/// return both the best params and a CorpusReport. Used by the +/// meta-optimizer's outer evaluation. +pub fn evaluate_score_weights( + weights: &ScoreWeights, + corpus: &[(char, Hull)], + axes: &[Axis], + base: &PaintParams, + n_starts: usize, + max_passes: u32, +) -> (PaintParams, CorpusReport) { + // Inner score function uses the supplied weights. 
+    let inner_score = |p: &PaintParams| -> f32 {
+        corpus.par_iter().map(|(ch, hull)| {
+            let (_, m) = metrics_for(hull, p);
+            let mut s = score_for_letter_with_weights(*ch, &m, weights);
+            // score_for_letter_with_weights already includes the constraint barriers.
+            s += 0.0; // placeholder
+            s
+        }).sum()
+    };
+
+    let try_axis = |params: &PaintParams, axis: &Axis, v: f32| -> f32 {
+        let mut p = params.clone();
+        let v = if axis.is_int { v.round().clamp(axis.lo, axis.hi) } else { v.clamp(axis.lo, axis.hi) };
+        (axis.set)(&mut p, v);
+        inner_score(&p)
+    };
+    let golden = |params: &PaintParams, axis: &Axis, iters: u32| -> (f32, f32) {
+        const PHI: f32 = 0.6180339887;
+        let (mut a, mut b) = (axis.lo, axis.hi);
+        let mut x1 = b - PHI * (b - a); let mut x2 = a + PHI * (b - a);
+        let mut f1 = try_axis(params, axis, x1); let mut f2 = try_axis(params, axis, x2);
+        for _ in 0..iters {
+            if f1 < f2 { b = x2; x2 = x1; f2 = f1; x1 = b - PHI * (b - a); f1 = try_axis(params, axis, x1); }
+            else { a = x1; x1 = x2; f1 = f2; x2 = a + PHI * (b - a); f2 = try_axis(params, axis, x2); }
+            if axis.is_int && (b - a) < 1.0 { break; }
+        }
+        if f1 < f2 { (if axis.is_int { x1.round() } else { x1 }, f1) }
+        else { (if axis.is_int { x2.round() } else { x2 }, f2) }
+    };
+    let refine = |start: &PaintParams| -> (PaintParams, f32) {
+        let mut current = start.clone();
+        let mut current_score = inner_score(&current);
+        for _ in 0..max_passes {
+            let per_axis: Vec<(usize, f32, f32)> = axes.par_iter().enumerate().map(|(ai, ax)| {
+                let (v, s) = golden(&current, ax, 8); // fewer iters than full inner — meta is outer of outer
+                (ai, v, s)
+            }).collect();
+            let (best_ai, best_v, best_s) = per_axis.iter()
+                .min_by(|a, b| a.2.partial_cmp(&b.2).unwrap()).cloned().unwrap();
+            if best_s + 1.0 >= current_score { break; }
+            (axes[best_ai].set)(&mut current, best_v);
+            current_score = best_s;
+        }
+        (current, current_score)
+    };
+
+    // Run all starts in parallel.
Print a progress dot as each start
+    // completes (via an atomic counter) so the user can
+    // see something happening during long meta-optimization runs.
+    let starts: Vec<PaintParams> = (0..n_starts)
+        .map(|i| build_start_params(i, base, axes))
+        .collect();
+    let counter = std::sync::atomic::AtomicUsize::new(0);
+    let results: Vec<(PaintParams, f32)> = starts.par_iter().map(|s| {
+        let r = refine(s);
+        let n = counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1;
+        eprint!("\r[inner] refined {n}/{n_starts}");
+        if n == n_starts { eprint!("\n"); }
+        r
+    }).collect();
+    let (best_params, _) = results.into_iter()
+        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()).unwrap();
+
+    // Build the CorpusReport on the BEST params.
+    let letter_metrics: Vec<(char, PaintMetrics)> = corpus.iter()
+        .map(|(ch, hull)| {
+            let (_, m) = metrics_for(hull, &best_params);
+            (*ch, m)
+        }).collect();
+    let report = CorpusReport::build(&letter_metrics);
+    (best_params, report)
+}
+
+/// Same shape as score_for_letter but takes ScoreWeights so the meta-
+/// optimizer can vary them. Mirrors the original `score_for_letter`
+/// barriers exactly; the only thing the meta optimizer changes is the
+/// soft-term weights (ScoreWeights), not the hard barriers.
+fn score_for_letter_with_weights(ch: char, m: &PaintMetrics, w: &ScoreWeights) -> f32 { + use crate::brush_paint::score_weighted; + let mut s = score_weighted(m, *w); + if m.total_swept > 0 { + let bg_rate = m.bg_painted as f32 / m.total_swept as f32; + if bg_rate > 0.05 { + s += 100_000_000.0 * (bg_rate - 0.05); + } + } + let cluster_threshold = 0.5 * std::f32::consts::PI * m.brush_radius * m.brush_radius; + let max_cluster = m.unpainted_clusters.iter().copied().max().unwrap_or(0) as f32; + if max_cluster > cluster_threshold { + s += 1_000_000.0 * (max_cluster - cluster_threshold); + } + if m.skeleton_length > 0 && m.total_length > 2.0 * m.skeleton_length as f32 { + s += 100_000.0 * (m.total_length - 2.0 * m.skeleton_length as f32); + } + if m.strokes == 0 { s += 200_000.0; } + if is_single_stroke_letter(ch) && m.strokes != 1 { + s += 50_000.0 * ((m.strokes as i64 - 1).abs() as f32); + } + if is_two_stroke_letter(ch) && m.strokes != 2 { + s += 50_000.0 * ((m.strokes as i64 - 2).abs() as f32); + } + s +} + +// ─── Meta-optimizer (outer search over ScoreWeights) ──────────────────── + +/// One outer-sample's outcome: the candidate weights, the best inner +/// params they produced, and the lexicographic report. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetaResult { + pub idx: usize, + pub weights: ScoreWeights, + pub params: PaintParams, + pub report: CorpusReport, +} + +/// Sample one ScoreWeights from a deterministic per-index PRNG. The +/// ranges are picked to roughly bracket the existing defaults at ½×–4×. 
+pub fn build_meta_weights(idx: usize) -> ScoreWeights { + if idx == 0 { return ScoreWeights::default(); } + let mut state = (idx as u64) + .wrapping_mul(0xDA942042E4DD58B5) + .wrapping_add(0xCAFEBABE); + let next = |state: &mut u64| -> f32 { + *state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + ((state.wrapping_shr(33)) as u32 as f32) / (u32::MAX as f32) + }; + let unif = |state: &mut u64, lo: f32, hi: f32| lo + next(state) * (hi - lo); + + ScoreWeights { + stroke: unif(&mut state, 100.0, 2000.0), + length: unif(&mut state, 0.5, 20.0), + bg: unif(&mut state, 10.0, 200.0), + repaint: unif(&mut state, 5.0, 100.0), + unpainted: unif(&mut state, 5.0, 300.0), + unpainted_density: unif(&mut state, 1.0, 50.0), + length_excess: unif(&mut state, 50.0, 1500.0), + curvature: unif(&mut state, 50.0, 2000.0), + brush_size: unif(&mut state, 0.0, 8000.0), + } +} + +/// Run the meta-optimizer: try `n_outer` random ScoreWeights, run the +/// inner optimizer for each, rank lexicographically, return ALL results +/// sorted best-first. +/// +/// Progress: prints one line to stderr per outer sample as it +/// completes — sample idx, elapsed, this-result's tier-1/tier-2 +/// summary, and whether it's the best yet (★ = improved). 
pub fn run_meta_opt(
+    n_outer: usize,
+    n_inner_starts: usize,
+    inner_passes: u32,
+    base: &PaintParams,
+) -> Vec<MetaResult> {
+    let axes = default_axes();
+    let corpus = build_corpus();
+
+    let t_start = std::time::Instant::now();
+    eprintln!("[meta] starting {} outer × {} inner × {} passes",
+        n_outer, n_inner_starts, inner_passes);
+
+    let mut results: Vec<MetaResult> = Vec::with_capacity(n_outer);
+    let mut best_so_far_report: Option<CorpusReport> = None;
+    for idx in 0..n_outer {
+        let t0 = std::time::Instant::now();
+        let weights = build_meta_weights(idx);
+        let (params, report) = evaluate_score_weights(
+            &weights, &corpus, &axes, base, n_inner_starts, inner_passes
+        );
+        let dt = t0.elapsed().as_secs_f64();
+        let total = t_start.elapsed().as_secs_f64();
+        let est_remaining = (total / (idx as f64 + 1.0)) * (n_outer as f64 - idx as f64 - 1.0);
+
+        let is_new_best = match &best_so_far_report {
+            None => true,
+            Some(b) => compare_reports(&report, b) == std::cmp::Ordering::Less,
+        };
+        let marker = if is_new_best { "★" } else { " " };
+        eprintln!(
+            "[meta] {}{:3}/{} {:6.1}s {} (total {:.0}s, eta {:.0}s)",
+            marker, idx + 1, n_outer, dt, report.summary(), total, est_remaining,
+        );
+
+        if is_new_best {
+            best_so_far_report = Some(report.clone());
+        }
+        results.push(MetaResult { idx, weights, params, report });
+    }
+    eprintln!("[meta] done, lex-sorting {} results", results.len());
+    results.sort_by(|a, b| compare_reports(&a.report, &b.report));
+    results
+}