brush-paint: fix optimizer hot-path leaks discovered via samply profile

Profiled `evaluate(corpus, &default)` (the per-call optimizer entry) under samply for 60s. Found two big leaks, both around `paint_fill_debug`, which `metrics_for` was calling unconditionally: 1. The SDF was being recomputed in `paint_fill_with` and `paint_fill_debug` outside the hull cache, just to derive brush_radius from a percentile lookup. Fix: cache the sorted chamfer values in HullData so `sdf_percentile_q(q)` is O(1). Both functions now build their grid via `Grid::from_hull_data(h)` using the same Arc<HullData> they used for the percentile. 2. `paint_fill_debug` always rendered three base64-encoded PNGs (~23% of CPU was in flate2's ZlibEncoder) and recorded full per-step WalkTrace lists, even when called by `metrics_for`, which discards both. Fix: split into a private inner function with `record_walks` / `render_pngs` flags; the public `paint_fill_debug` keeps both on, `metrics_for` turns both off. Bench: `evaluate(corpus, &default)` × 1700 iters — before: 52 ms/iter; after: 35 ms/iter → 1.49× faster. Bit-exact w.r.t. the alphabet report. Also adds `src/bin/paint_bench.rs` — a small fixed-duration loop over `evaluate` for profiling under samply / Instruments.
2026-05-07 16:51:05 -07:00
parent 34ee79f543
commit 8043254593
2 changed files with 101 additions and 22 deletions
--- a/src/bin/paint_bench.rs
+++ b/src/bin/paint_bench.rs
@@ -0,0 +1,40 @@
+//! Hot-path bench: build the optimizer corpus once, then loop calling
+//! `evaluate(corpus, &default_params)` for a fixed wall-clock duration.
+//! Prints iter count + ms/iter so you have a baseline number, and
+//! holds the process up long enough that an external profiler
+//! (samply, sample, Instruments) can capture a representative trace.
+//!
+//! Usage: paint_bench [seconds]   (default 60)
+
+use std::time::{Duration, Instant};
+use trac3r_lib::brush_paint::PaintParams;
+use trac3r_lib::brush_paint_opt::{build_corpus, evaluate};
+
+fn main() {
+    let secs: u64 = std::env::args().nth(1)
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(60);
+    eprintln!("[bench] building corpus...");
+    let corpus = build_corpus();
+    eprintln!("[bench] corpus: {} hulls", corpus.len());
+    let params = PaintParams::default();
+    eprintln!("[bench] pid={} running for {}s", std::process::id(), secs);
+
+    // Untimed warm-up call so the hull cache is already populated when timing starts.
+    let _ = evaluate(&corpus, &params);
+
+    let start = Instant::now();
+    let deadline = start + Duration::from_secs(secs);
+    let mut iters = 0u32;
+    while Instant::now() < deadline {
+        let _ = evaluate(&corpus, &params);
+        iters += 1;
+        if iters.is_multiple_of(10) {
+            let elapsed = start.elapsed().as_secs_f64();
+            eprintln!("[bench] {iters} iters, {:.0} ms/iter", 1000.0 * elapsed / f64::from(iters));
+        }
+    }
+    let elapsed = start.elapsed().as_secs_f64(); // `.max(1)` below: a 0s run does no iterations
+    eprintln!("[bench] DONE: {iters} iters in {:.1}s = {:.0} ms/iter",
+        elapsed, 1000.0 * elapsed / f64::from(iters.max(1)));
+}
--- a/src/brush_paint.rs
+++ b/src/brush_paint.rs
@@ -478,11 +478,28 @@ struct HullData {
    width: i32, height: i32,
    was_ink: BitMask,
    sdf: Vec<f32>,
+    /// Sorted chamfer-distance values for the ink pixels (the same set
+    /// `chamfer_distance` returns). Lets `sdf_percentile_q(q)` answer
+    /// in O(1) instead of recomputing chamfer + sort. Critical for
+    /// the optimizer hot path: `paint_fill_with` needs an SDF
+    /// percentile to derive `brush_radius` and was redundantly
+    /// recomputing chamfer per call.
+    sdf_values_sorted: Vec<f32>,
    skel_endpoints: Vec<(i32, i32)>,
    skeleton_length: u32,
    ink_total: i32,
 }

+impl HullData {
+    /// Percentile lookup: value at index `round((len - 1) * q)`, `q` clamped to [0, 1]; 0.0 if empty.
+    fn sdf_percentile_q(&self, q: f32) -> f32 {
+        let sorted = self.sdf_values_sorted.as_slice();
+        let Some(last) = sorted.len().checked_sub(1) else { return 0.0 };
+        let rank = ((sorted.len() as f32 - 1.0) * q.clamp(0.0, 1.0)).round() as usize;
+        sorted[rank.min(last)]
+    }
+}
+
 /// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
 /// IDs from a per-call counter, so distinct hulls from different
 /// rasterizations collide on id. Mirror-image letters (p/q at the
@@ -554,6 +571,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
        sdf[(ly * width + lx) as usize] = d;
    }
    let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
+    let mut sdf_values_sorted: Vec<f32> = dist.values().copied().collect();
+    sdf_values_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let mut skel = zhang_suen_thin(&hull.pixels);
    let spur_len = (sdf_max * 1.5).round() as usize;
    prune_skeleton_spurs(&mut skel, spur_len.max(2));
@@ -562,8 +581,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
        .map(|&(x, y)| (x as i32, y as i32))
        .collect();
    let skeleton_length = skel.len() as u32;
-    HullData { bx, by, width, height, was_ink, sdf, skel_endpoints,
-               skeleton_length, ink_total: count }
+    HullData { bx, by, width, height, was_ink, sdf, sdf_values_sorted,
+               skel_endpoints, skeleton_length, ink_total: count }
 }

 // ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
@@ -607,7 +626,13 @@ struct Grid {

 impl Grid {
    fn from_hull(hull: &Hull) -> Self {
-        let h = get_or_compute_hull_data(hull);
+        Self::from_hull_data(get_or_compute_hull_data(hull))
+    }
+
+    /// Construct a Grid from an already-fetched HullData. Lets the
+    /// caller use the same Arc<HullData> for cheap SDF-percentile
+    /// lookup AND for the Grid, avoiding two cache lookups per call.
+    fn from_hull_data(h: Arc<HullData>) -> Self {
        let unpainted = h.was_ink.clone();
        let ink_total = h.ink_total;
        let bx = h.bx; let by = h.by;
@@ -1160,12 +1185,11 @@ pub fn paint_fill_with(hull: &Hull, params: &PaintParams) -> FillResult {
    if hull.pixels.is_empty() {
        return FillResult { hull_id: hull.id, strokes: vec![] };
    }
-    let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
-    let dist = chamfer_distance(hull, &pixel_set);
-    let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
+    let h = get_or_compute_hull_data(hull);
+    let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
    let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;

-    let mut grid = Grid::from_hull(hull);
+    let mut grid = Grid::from_hull_data(h);
    grid.set_brush(brush_radius);
    let mut strokes: Vec<Vec<(f32, f32)>> = Vec::new();

@@ -1240,12 +1264,11 @@ pub struct PaintMetrics {
    pub brush_radius:   f32,
 }

-/// Compute metrics by running paint_fill_debug. This gives an
-/// authoritative `ink_unpainted` (paint_fill_with stamps single disks
-/// for sub-threshold components, which don't appear in the returned
-/// stroke geometry — replaying strokes alone overcounts unpainted ink).
+/// Compute metrics by running the painter. Skips walk-trace
+/// recording and PNG rendering — both are debug-viewer-only and
+/// add ~25% overhead to the optimizer's hot loop.
 pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetrics) {
-    let dbg = paint_fill_debug(hull, params);
+    let dbg = paint_fill_debug_inner(hull, params, false, false);
    let strokes = dbg.strokes.iter().filter(|s| s.len() >= 2).cloned().collect::<Vec<_>>();
    let total_length: f32 = strokes.iter().map(|s| {
        s.windows(2).map(|w| {
@@ -1507,18 +1530,25 @@ pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 {
  - w.brush_size        * m.brush_radius
 }

-pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
+/// Internal: do the painting and produce a fully-populated PaintDebug.
+/// `record_walks` enables the WalkTrace step recording (heavy — also
+/// triggers per-candidate breakdown work in walk_brush). `render_pngs`
+/// enables base64 PNG encoding for the frontend overlays. Both
+/// default-off paths are taken by `metrics_for`, the optimizer's
+/// per-call entry, where neither output is read — that path runs
+/// noticeably faster as a result.
+fn paint_fill_debug_inner(hull: &Hull, params: &PaintParams,
+                          record_walks: bool, render_pngs: bool) -> PaintDebug {
    let bounds = [
        hull.bounds.x_min as f32, hull.bounds.y_min as f32,
        hull.bounds.x_max as f32, hull.bounds.y_max as f32,
    ];
-    let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
-    let dist = chamfer_distance(hull, &pixel_set);
-    let sdf_max = dist.values().cloned().fold(0.0_f32, f32::max).max(0.5);
-    let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
+    let h = get_or_compute_hull_data(hull);
+    let sdf_max = h.sdf_values_sorted.last().copied().unwrap_or(0.0).max(0.5);
+    let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
    let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;

-    let mut grid = Grid::from_hull(hull);
+    let mut grid = Grid::from_hull_data(h);
    grid.set_brush(brush_radius);
    let mut trajectories: Vec<Vec<(f32, f32)>> = Vec::new();
    let mut starts: Vec<(f32, f32)> = Vec::new();
@@ -1532,8 +1562,9 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
        let start = match grid.pick_next_component(min_component_pixels) {
            Some(s) => s, None => break,
        };
+        let walk_log = if record_walks { Some(&mut walks) } else { None };
        let path = trace_stroke(start, &mut grid, params, brush_radius,
-                                Some(&mut walks), stroke_idx);
+                                walk_log, stroke_idx);
        if path.len() >= 2 {
            // Record path[0] as the "start" — that's where the gcode
            // pen actually comes down.
@@ -1551,18 +1582,22 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
        .filter(|s| s.len() >= 2)
        .collect();

-    let (sdf_b64, _) = encode_sdf_b64(hull);
    let ink_unpainted = grid.ink_remaining.max(0) as u32;
    let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid);
    let skeleton_length = grid.skeleton_length;
    let unpainted_clusters = grid.unpainted_cluster_sizes();
+    let (source_b64, sdf_b64, coverage_b64) = if render_pngs {
+        (encode_hull_pixels_b64(hull), encode_sdf_b64(hull).0, encode_coverage_b64(&grid))
+    } else {
+        (String::new(), String::new(), String::new())
+    };
    PaintDebug {
        bounds,
-        source_b64:   encode_hull_pixels_b64(hull),
+        source_b64,
        sdf_b64,
        sdf_max,
        brush_radius,
-        coverage_b64: encode_coverage_b64(&grid),
+        coverage_b64,
        ink_total:     grid.ink_total.max(0) as u32,
        ink_unpainted,
        bg_painted,
@@ -1577,6 +1612,10 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
    }
 }

+pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
+    paint_fill_debug_inner(hull, params, true, true) // record_walks = true, render_pngs = true
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;