brush-paint: fix optimizer hot-path leaks discovered via samply profile

Profiled `evaluate(corpus, &default)` (the per-call optimizer entry)
under samply for 60s. It showed two big performance leaks, both around
`paint_fill_debug`, which `metrics_for` was calling unconditionally:

1. SDF was being recomputed in `paint_fill_with` and
   `paint_fill_debug` outside the hull cache, just to derive
   brush_radius from a percentile lookup. Fix: cache the sorted
   chamfer values in HullData so `sdf_percentile_q(q)` is O(1).
   Both functions now go through `Grid::from_hull_data(h)` using
   the same Arc<HullData> they used for the percentile.

2. `paint_fill_debug` always rendered three base64-encoded PNGs
   (~23% of CPU was in flate2's ZlibEncoder) and recorded full
   per-step WalkTrace lists, even when called by `metrics_for`
   which discards both. Fix: split into a private inner function
   with `record_walks` / `render_pngs` flags; the public
   `paint_fill_debug` keeps both on, `metrics_for` turns both off.

Bench: `evaluate(corpus, &default)` × 1700 iters

  before:  52 ms/iter
  after:   35 ms/iter   →  1.49× faster

Bit-exact w.r.t. the alphabet report.

Also adds `src/bin/paint_bench.rs` — a small fixed-duration loop
over `evaluate` for profiling under samply / Instruments.
This commit is contained in:
Mitchell Hansen
2026-05-07 16:51:05 -07:00
parent 34ee79f543
commit 8043254593
2 changed files with 101 additions and 22 deletions

40
src/bin/paint_bench.rs Normal file
View File

@@ -0,0 +1,40 @@
//! Hot-path bench: build the optimizer corpus once, then loop calling
//! `evaluate(corpus, &default_params)` for a fixed wall-clock duration.
//! Prints iter count + ms/iter so you have a baseline number, and
//! holds the process up long enough that an external profiler
//! (samply, sample, Instruments) can capture a representative trace.
//!
//! Usage: paint_bench [seconds] (default 60)
use std::time::{Duration, Instant};
use trac3r_lib::brush_paint::PaintParams;
use trac3r_lib::brush_paint_opt::{build_corpus, evaluate};
/// Benchmark entry point.
///
/// Builds the corpus once, makes one untimed warm-up call to `evaluate`,
/// then calls `evaluate(&corpus, &params)` repeatedly until the requested
/// wall-clock budget is used up. Progress is written to stderr every 10
/// iterations, followed by a final summary line.
///
/// The first CLI argument is the run time in whole seconds; a missing or
/// unparsable argument falls back to 60.
fn main() {
    let secs: u64 = std::env::args()
        .nth(1)
        .and_then(|s| s.parse().ok())
        .unwrap_or(60);

    eprintln!("[bench] building corpus...");
    let corpus = build_corpus();
    eprintln!("[bench] corpus: {} hulls", corpus.len());
    let params = PaintParams::default();
    eprintln!("[bench] pid={} running for {}s", std::process::id(), secs);

    // Warm up the hull cache and anything else that is initialised on first
    // use, so one-off costs stay out of the timed loop. `black_box` keeps the
    // optimizer from treating the discarded result as dead work.
    let _ = std::hint::black_box(evaluate(&corpus, &params));

    // Compare elapsed time against the budget rather than computing
    // `Instant::now() + budget`, which can panic on overflow for a huge
    // `secs` argument.
    let budget = Duration::from_secs(secs);
    let start = Instant::now();
    // u64 so a very long run cannot overflow the counter.
    let mut iters = 0u64;
    while start.elapsed() < budget {
        let _ = std::hint::black_box(evaluate(&corpus, &params));
        iters += 1;
        if iters.is_multiple_of(10) {
            let elapsed = start.elapsed().as_secs_f64();
            eprintln!("[bench] {iters} iters, {:.0} ms/iter", 1000.0 * elapsed / iters as f64);
        }
    }

    let elapsed = start.elapsed().as_secs_f64();
    if iters == 0 {
        // A zero-second budget runs no timed iterations; report that plainly
        // instead of dividing by zero and printing `NaN ms/iter`.
        eprintln!("[bench] DONE: 0 iters in {:.1}s (no timed iterations)", elapsed);
        return;
    }
    eprintln!("[bench] DONE: {iters} iters in {:.1}s = {:.0} ms/iter",
        elapsed, 1000.0 * elapsed / iters as f64);
}

View File

@@ -478,11 +478,28 @@ struct HullData {
width: i32, height: i32, width: i32, height: i32,
was_ink: BitMask, was_ink: BitMask,
sdf: Vec<f32>, sdf: Vec<f32>,
/// Sorted chamfer-distance values for the ink pixels (the same set
/// `chamfer_distance` returns). Lets `sdf_percentile_q(q)` answer
/// in O(1) instead of recomputing chamfer + sort. Critical for
/// the optimizer hot path: `paint_fill_with` needs an SDF
/// percentile to derive `brush_radius` and was redundantly
/// recomputing chamfer per call.
sdf_values_sorted: Vec<f32>,
skel_endpoints: Vec<(i32, i32)>, skel_endpoints: Vec<(i32, i32)>,
skeleton_length: u32, skeleton_length: u32,
ink_total: i32, ink_total: i32,
} }
impl HullData {
    /// Look up the `q`-quantile of the cached SDF values by rounding to the
    /// nearest index in `sdf_values_sorted`.
    ///
    /// `q` is clamped to `[0.0, 1.0]` first. Returns `0.0` when no values
    /// are cached. The lookup is a single index into the cached vector, so
    /// no chamfer pass or sort happens here.
    fn sdf_percentile_q(&self, q: f32) -> f32 {
        let sorted = &self.sdf_values_sorted;
        let Some(top) = sorted.len().checked_sub(1) else {
            return 0.0;
        };
        let fraction = q.clamp(0.0, 1.0);
        // Scale the fraction onto the index range and round to the nearest
        // slot; the float-to-usize cast saturates, and `min` keeps the
        // result inside the slice regardless.
        let rank = ((sorted.len() as f32 - 1.0) * fraction).round() as usize;
        sorted[rank.min(top)]
    }
}
/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns /// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
/// IDs from a per-call counter, so distinct hulls from different /// IDs from a per-call counter, so distinct hulls from different
/// rasterizations collide on id. Mirror-image letters (p/q at the /// rasterizations collide on id. Mirror-image letters (p/q at the
@@ -554,6 +571,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
sdf[(ly * width + lx) as usize] = d; sdf[(ly * width + lx) as usize] = d;
} }
let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5); let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
let mut sdf_values_sorted: Vec<f32> = dist.values().copied().collect();
sdf_values_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mut skel = zhang_suen_thin(&hull.pixels); let mut skel = zhang_suen_thin(&hull.pixels);
let spur_len = (sdf_max * 1.5).round() as usize; let spur_len = (sdf_max * 1.5).round() as usize;
prune_skeleton_spurs(&mut skel, spur_len.max(2)); prune_skeleton_spurs(&mut skel, spur_len.max(2));
@@ -562,8 +581,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
.map(|&(x, y)| (x as i32, y as i32)) .map(|&(x, y)| (x as i32, y as i32))
.collect(); .collect();
let skeleton_length = skel.len() as u32; let skeleton_length = skel.len() as u32;
HullData { bx, by, width, height, was_ink, sdf, skel_endpoints, HullData { bx, by, width, height, was_ink, sdf, sdf_values_sorted,
skeleton_length, ink_total: count } skel_endpoints, skeleton_length, ink_total: count }
} }
// ── Coverage grid: per-call mutable state, sized to the hull's bbox ───── // ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
@@ -607,7 +626,13 @@ struct Grid {
impl Grid { impl Grid {
fn from_hull(hull: &Hull) -> Self { fn from_hull(hull: &Hull) -> Self {
let h = get_or_compute_hull_data(hull); Self::from_hull_data(get_or_compute_hull_data(hull))
}
/// Construct a Grid from an already-fetched HullData. Lets the
/// caller use the same Arc<HullData> for cheap SDF-percentile
/// lookup AND for the Grid, avoiding two cache lookups per call.
fn from_hull_data(h: Arc<HullData>) -> Self {
let unpainted = h.was_ink.clone(); let unpainted = h.was_ink.clone();
let ink_total = h.ink_total; let ink_total = h.ink_total;
let bx = h.bx; let by = h.by; let bx = h.bx; let by = h.by;
@@ -1160,12 +1185,11 @@ pub fn paint_fill_with(hull: &Hull, params: &PaintParams) -> FillResult {
if hull.pixels.is_empty() { if hull.pixels.is_empty() {
return FillResult { hull_id: hull.id, strokes: vec![] }; return FillResult { hull_id: hull.id, strokes: vec![] };
} }
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); let h = get_or_compute_hull_data(hull);
let dist = chamfer_distance(hull, &pixel_set); let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px; let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;
let mut grid = Grid::from_hull(hull); let mut grid = Grid::from_hull_data(h);
grid.set_brush(brush_radius); grid.set_brush(brush_radius);
let mut strokes: Vec<Vec<(f32, f32)>> = Vec::new(); let mut strokes: Vec<Vec<(f32, f32)>> = Vec::new();
@@ -1240,12 +1264,11 @@ pub struct PaintMetrics {
pub brush_radius: f32, pub brush_radius: f32,
} }
/// Compute metrics by running paint_fill_debug. This gives an /// Compute metrics by running the painter. Skips walk-trace
/// authoritative `ink_unpainted` (paint_fill_with stamps single disks /// recording and PNG rendering — both are debug-viewer-only and
/// for sub-threshold components, which don't appear in the returned /// add ~25% overhead to the optimizer's hot loop.
/// stroke geometry — replaying strokes alone overcounts unpainted ink).
pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetrics) { pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetrics) {
let dbg = paint_fill_debug(hull, params); let dbg = paint_fill_debug_inner(hull, params, false, false);
let strokes = dbg.strokes.iter().filter(|s| s.len() >= 2).cloned().collect::<Vec<_>>(); let strokes = dbg.strokes.iter().filter(|s| s.len() >= 2).cloned().collect::<Vec<_>>();
let total_length: f32 = strokes.iter().map(|s| { let total_length: f32 = strokes.iter().map(|s| {
s.windows(2).map(|w| { s.windows(2).map(|w| {
@@ -1507,18 +1530,25 @@ pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 {
- w.brush_size * m.brush_radius - w.brush_size * m.brush_radius
} }
pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug { /// Internal: do the painting and produce a fully-populated PaintDebug.
/// `record_walks` enables the WalkTrace step recording (heavy — also
/// triggers per-candidate breakdown work in walk_brush). `render_pngs`
/// enables base64 PNG encoding for the frontend overlays.
/// `metrics_for`, the optimizer's per-call entry, passes `false` for
/// both because it reads neither output — that path runs
/// noticeably faster as a result.
fn paint_fill_debug_inner(hull: &Hull, params: &PaintParams,
record_walks: bool, render_pngs: bool) -> PaintDebug {
let bounds = [ let bounds = [
hull.bounds.x_min as f32, hull.bounds.y_min as f32, hull.bounds.x_min as f32, hull.bounds.y_min as f32,
hull.bounds.x_max as f32, hull.bounds.y_max as f32, hull.bounds.x_max as f32, hull.bounds.y_max as f32,
]; ];
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); let h = get_or_compute_hull_data(hull);
let dist = chamfer_distance(hull, &pixel_set); let sdf_max = h.sdf_values_sorted.last().copied().unwrap_or(0.0).max(0.5);
let sdf_max = dist.values().cloned().fold(0.0_f32, f32::max).max(0.5); let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px; let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;
let mut grid = Grid::from_hull(hull); let mut grid = Grid::from_hull_data(h);
grid.set_brush(brush_radius); grid.set_brush(brush_radius);
let mut trajectories: Vec<Vec<(f32, f32)>> = Vec::new(); let mut trajectories: Vec<Vec<(f32, f32)>> = Vec::new();
let mut starts: Vec<(f32, f32)> = Vec::new(); let mut starts: Vec<(f32, f32)> = Vec::new();
@@ -1532,8 +1562,9 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
let start = match grid.pick_next_component(min_component_pixels) { let start = match grid.pick_next_component(min_component_pixels) {
Some(s) => s, None => break, Some(s) => s, None => break,
}; };
let walk_log = if record_walks { Some(&mut walks) } else { None };
let path = trace_stroke(start, &mut grid, params, brush_radius, let path = trace_stroke(start, &mut grid, params, brush_radius,
Some(&mut walks), stroke_idx); walk_log, stroke_idx);
if path.len() >= 2 { if path.len() >= 2 {
// Record path[0] as the "start" — that's where the gcode // Record path[0] as the "start" — that's where the gcode
// pen actually comes down. // pen actually comes down.
@@ -1551,18 +1582,22 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
.filter(|s| s.len() >= 2) .filter(|s| s.len() >= 2)
.collect(); .collect();
let (sdf_b64, _) = encode_sdf_b64(hull);
let ink_unpainted = grid.ink_remaining.max(0) as u32; let ink_unpainted = grid.ink_remaining.max(0) as u32;
let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid); let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid);
let skeleton_length = grid.skeleton_length; let skeleton_length = grid.skeleton_length;
let unpainted_clusters = grid.unpainted_cluster_sizes(); let unpainted_clusters = grid.unpainted_cluster_sizes();
let (source_b64, sdf_b64, coverage_b64) = if render_pngs {
(encode_hull_pixels_b64(hull), encode_sdf_b64(hull).0, encode_coverage_b64(&grid))
} else {
(String::new(), String::new(), String::new())
};
PaintDebug { PaintDebug {
bounds, bounds,
source_b64: encode_hull_pixels_b64(hull), source_b64,
sdf_b64, sdf_b64,
sdf_max, sdf_max,
brush_radius, brush_radius,
coverage_b64: encode_coverage_b64(&grid), coverage_b64,
ink_total: grid.ink_total.max(0) as u32, ink_total: grid.ink_total.max(0) as u32,
ink_unpainted, ink_unpainted,
bg_painted, bg_painted,
@@ -1577,6 +1612,10 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
} }
} }
/// Run the painter for `hull` with full debug output.
///
/// Delegates to `paint_fill_debug_inner` with both walk-trace recording
/// and PNG rendering switched on.
pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
    let record_walks = true;
    let render_pngs = true;
    paint_fill_debug_inner(hull, params, record_walks, render_pngs)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;