brush-paint: fix optimizer hot-path leaks discovered via samply profile
Profiled `evaluate(corpus, &default)` (the per-call optimizer entry) under samply for 60s. Found two big leaks, both around `paint_fill_debug`, which `metrics_for` was calling unconditionally: 1. The SDF was being recomputed in `paint_fill_with` and `paint_fill_debug` outside the hull cache, just to derive brush_radius from a percentile lookup. Fix: cache the sorted chamfer values in HullData so `sdf_percentile_q(q)` is O(1). Both functions now go through `Grid::from_hull_data(h)`, using the same Arc<HullData> they used for the percentile. 2. `paint_fill_debug` always rendered three base64-encoded PNGs (~23% of CPU was in flate2's ZlibEncoder) and recorded full per-step WalkTrace lists, even when called by `metrics_for`, which discards both. Fix: split into a private inner function with `record_walks` / `render_pngs` flags; the public `paint_fill_debug` keeps both on, `metrics_for` turns both off. Bench: `evaluate(corpus, &default)` × 1700 iters: before 52 ms/iter, after 35 ms/iter → 1.49× faster. Output is bit-exact w.r.t. the alphabet report. Also adds `src/bin/paint_bench.rs` — a small fixed-duration loop over `evaluate` for profiling under samply / Instruments.
This commit is contained in:
40
src/bin/paint_bench.rs
Normal file
40
src/bin/paint_bench.rs
Normal file
@@ -0,0 +1,40 @@
|
||||
//! Hot-path bench: build the optimizer corpus once, then loop calling
|
||||
//! `evaluate(corpus, &default_params)` for a fixed wall-clock duration.
|
||||
//! Prints iter count + ms/iter so you have a baseline number, and
|
||||
//! holds the process up long enough that an external profiler
|
||||
//! (samply, sample, Instruments) can capture a representative trace.
|
||||
//!
|
||||
//! Usage: paint_bench [seconds] (default 60)
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
use trac3r_lib::brush_paint::PaintParams;
|
||||
use trac3r_lib::brush_paint_opt::{build_corpus, evaluate};
|
||||
|
||||
fn main() {
|
||||
let secs: u64 = std::env::args().nth(1)
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(60);
|
||||
eprintln!("[bench] building corpus...");
|
||||
let corpus = build_corpus();
|
||||
eprintln!("[bench] corpus: {} hulls", corpus.len());
|
||||
let params = PaintParams::default();
|
||||
eprintln!("[bench] pid={} running for {}s", std::process::id(), secs);
|
||||
|
||||
// Warm up the hull cache + jit any lazy code paths.
|
||||
let _ = evaluate(&corpus, ¶ms);
|
||||
|
||||
let deadline = Instant::now() + Duration::from_secs(secs);
|
||||
let mut iters = 0u32;
|
||||
let start = Instant::now();
|
||||
while Instant::now() < deadline {
|
||||
let _ = evaluate(&corpus, ¶ms);
|
||||
iters += 1;
|
||||
if iters.is_multiple_of(10) {
|
||||
let elapsed = start.elapsed().as_secs_f64();
|
||||
eprintln!("[bench] {iters} iters, {:.0} ms/iter", 1000.0 * elapsed / iters as f64);
|
||||
}
|
||||
}
|
||||
let elapsed = start.elapsed().as_secs_f64();
|
||||
eprintln!("[bench] DONE: {iters} iters in {:.1}s = {:.0} ms/iter",
|
||||
elapsed, 1000.0 * elapsed / iters as f64);
|
||||
}
|
||||
@@ -478,11 +478,28 @@ struct HullData {
|
||||
width: i32, height: i32,
|
||||
was_ink: BitMask,
|
||||
sdf: Vec<f32>,
|
||||
/// Sorted chamfer-distance values for the ink pixels (the same set
|
||||
/// `chamfer_distance` returns). Lets `sdf_percentile_q(q)` answer
|
||||
/// in O(1) instead of recomputing chamfer + sort. Critical for
|
||||
/// the optimizer hot path: `paint_fill_with` needs an SDF
|
||||
/// percentile to derive `brush_radius` and was redundantly
|
||||
/// recomputing chamfer per call.
|
||||
sdf_values_sorted: Vec<f32>,
|
||||
skel_endpoints: Vec<(i32, i32)>,
|
||||
skeleton_length: u32,
|
||||
ink_total: i32,
|
||||
}
|
||||
|
||||
impl HullData {
|
||||
fn sdf_percentile_q(&self, q: f32) -> f32 {
|
||||
let v = &self.sdf_values_sorted;
|
||||
if v.is_empty() { return 0.0; }
|
||||
let q = q.clamp(0.0, 1.0);
|
||||
let idx = ((v.len() as f32 - 1.0) * q).round() as usize;
|
||||
v[idx.min(v.len() - 1)]
|
||||
}
|
||||
}
|
||||
|
||||
/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
|
||||
/// IDs from a per-call counter, so distinct hulls from different
|
||||
/// rasterizations collide on id. Mirror-image letters (p/q at the
|
||||
@@ -554,6 +571,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
|
||||
sdf[(ly * width + lx) as usize] = d;
|
||||
}
|
||||
let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
|
||||
let mut sdf_values_sorted: Vec<f32> = dist.values().copied().collect();
|
||||
sdf_values_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
let mut skel = zhang_suen_thin(&hull.pixels);
|
||||
let spur_len = (sdf_max * 1.5).round() as usize;
|
||||
prune_skeleton_spurs(&mut skel, spur_len.max(2));
|
||||
@@ -562,8 +581,8 @@ fn compute_hull_data(hull: &Hull) -> HullData {
|
||||
.map(|&(x, y)| (x as i32, y as i32))
|
||||
.collect();
|
||||
let skeleton_length = skel.len() as u32;
|
||||
HullData { bx, by, width, height, was_ink, sdf, skel_endpoints,
|
||||
skeleton_length, ink_total: count }
|
||||
HullData { bx, by, width, height, was_ink, sdf, sdf_values_sorted,
|
||||
skel_endpoints, skeleton_length, ink_total: count }
|
||||
}
|
||||
|
||||
// ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
|
||||
@@ -607,7 +626,13 @@ struct Grid {
|
||||
|
||||
impl Grid {
|
||||
fn from_hull(hull: &Hull) -> Self {
|
||||
let h = get_or_compute_hull_data(hull);
|
||||
Self::from_hull_data(get_or_compute_hull_data(hull))
|
||||
}
|
||||
|
||||
/// Construct a Grid from an already-fetched HullData. Lets the
|
||||
/// caller use the same Arc<HullData> for cheap SDF-percentile
|
||||
/// lookup AND for the Grid, avoiding two cache lookups per call.
|
||||
fn from_hull_data(h: Arc<HullData>) -> Self {
|
||||
let unpainted = h.was_ink.clone();
|
||||
let ink_total = h.ink_total;
|
||||
let bx = h.bx; let by = h.by;
|
||||
@@ -1160,12 +1185,11 @@ pub fn paint_fill_with(hull: &Hull, params: &PaintParams) -> FillResult {
|
||||
if hull.pixels.is_empty() {
|
||||
return FillResult { hull_id: hull.id, strokes: vec![] };
|
||||
}
|
||||
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
|
||||
let dist = chamfer_distance(hull, &pixel_set);
|
||||
let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
|
||||
let h = get_or_compute_hull_data(hull);
|
||||
let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
|
||||
let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;
|
||||
|
||||
let mut grid = Grid::from_hull(hull);
|
||||
let mut grid = Grid::from_hull_data(h);
|
||||
grid.set_brush(brush_radius);
|
||||
let mut strokes: Vec<Vec<(f32, f32)>> = Vec::new();
|
||||
|
||||
@@ -1240,12 +1264,11 @@ pub struct PaintMetrics {
|
||||
pub brush_radius: f32,
|
||||
}
|
||||
|
||||
/// Compute metrics by running paint_fill_debug. This gives an
|
||||
/// authoritative `ink_unpainted` (paint_fill_with stamps single disks
|
||||
/// for sub-threshold components, which don't appear in the returned
|
||||
/// stroke geometry — replaying strokes alone overcounts unpainted ink).
|
||||
/// Compute metrics by running the painter. Skips walk-trace
|
||||
/// recording and PNG rendering — both are debug-viewer-only and
|
||||
/// add ~25% overhead to the optimizer's hot loop.
|
||||
pub fn metrics_for(hull: &Hull, params: &PaintParams) -> (FillResult, PaintMetrics) {
|
||||
let dbg = paint_fill_debug(hull, params);
|
||||
let dbg = paint_fill_debug_inner(hull, params, false, false);
|
||||
let strokes = dbg.strokes.iter().filter(|s| s.len() >= 2).cloned().collect::<Vec<_>>();
|
||||
let total_length: f32 = strokes.iter().map(|s| {
|
||||
s.windows(2).map(|w| {
|
||||
@@ -1507,18 +1530,25 @@ pub fn score_weighted(m: &PaintMetrics, w: ScoreWeights) -> f32 {
|
||||
- w.brush_size * m.brush_radius
|
||||
}
|
||||
|
||||
pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
|
||||
/// Internal: do the painting and produce a fully-populated PaintDebug.
|
||||
/// `record_walks` enables the WalkTrace step recording (heavy — also
|
||||
/// triggers per-candidate breakdown work in walk_brush). `render_pngs`
|
||||
/// enables base64 PNG encoding for the frontend overlays. Both
|
||||
/// default-off paths are taken by `metrics_for`, the optimizer's
|
||||
/// per-call entry, where neither output is read — that path runs
|
||||
/// noticeably faster as a result.
|
||||
fn paint_fill_debug_inner(hull: &Hull, params: &PaintParams,
|
||||
record_walks: bool, render_pngs: bool) -> PaintDebug {
|
||||
let bounds = [
|
||||
hull.bounds.x_min as f32, hull.bounds.y_min as f32,
|
||||
hull.bounds.x_max as f32, hull.bounds.y_max as f32,
|
||||
];
|
||||
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
|
||||
let dist = chamfer_distance(hull, &pixel_set);
|
||||
let sdf_max = dist.values().cloned().fold(0.0_f32, f32::max).max(0.5);
|
||||
let effective_sdf = sdf_percentile(&dist, params.brush_radius_percentile).max(0.5);
|
||||
let h = get_or_compute_hull_data(hull);
|
||||
let sdf_max = h.sdf_values_sorted.last().copied().unwrap_or(0.0).max(0.5);
|
||||
let effective_sdf = h.sdf_percentile_q(params.brush_radius_percentile).max(0.5);
|
||||
let brush_radius = params.brush_radius_factor * effective_sdf + params.brush_radius_offset_px;
|
||||
|
||||
let mut grid = Grid::from_hull(hull);
|
||||
let mut grid = Grid::from_hull_data(h);
|
||||
grid.set_brush(brush_radius);
|
||||
let mut trajectories: Vec<Vec<(f32, f32)>> = Vec::new();
|
||||
let mut starts: Vec<(f32, f32)> = Vec::new();
|
||||
@@ -1532,8 +1562,9 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
|
||||
let start = match grid.pick_next_component(min_component_pixels) {
|
||||
Some(s) => s, None => break,
|
||||
};
|
||||
let walk_log = if record_walks { Some(&mut walks) } else { None };
|
||||
let path = trace_stroke(start, &mut grid, params, brush_radius,
|
||||
Some(&mut walks), stroke_idx);
|
||||
walk_log, stroke_idx);
|
||||
if path.len() >= 2 {
|
||||
// Record path[0] as the "start" — that's where the gcode
|
||||
// pen actually comes down.
|
||||
@@ -1551,18 +1582,22 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
|
||||
.filter(|s| s.len() >= 2)
|
||||
.collect();
|
||||
|
||||
let (sdf_b64, _) = encode_sdf_b64(hull);
|
||||
let ink_unpainted = grid.ink_remaining.max(0) as u32;
|
||||
let (bg_painted, total_swept, repaint) = measure_sweep_full(&strokes, &grid);
|
||||
let skeleton_length = grid.skeleton_length;
|
||||
let unpainted_clusters = grid.unpainted_cluster_sizes();
|
||||
let (source_b64, sdf_b64, coverage_b64) = if render_pngs {
|
||||
(encode_hull_pixels_b64(hull), encode_sdf_b64(hull).0, encode_coverage_b64(&grid))
|
||||
} else {
|
||||
(String::new(), String::new(), String::new())
|
||||
};
|
||||
PaintDebug {
|
||||
bounds,
|
||||
source_b64: encode_hull_pixels_b64(hull),
|
||||
source_b64,
|
||||
sdf_b64,
|
||||
sdf_max,
|
||||
brush_radius,
|
||||
coverage_b64: encode_coverage_b64(&grid),
|
||||
coverage_b64,
|
||||
ink_total: grid.ink_total.max(0) as u32,
|
||||
ink_unpainted,
|
||||
bg_painted,
|
||||
@@ -1577,6 +1612,10 @@ pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn paint_fill_debug(hull: &Hull, params: &PaintParams) -> PaintDebug {
|
||||
paint_fill_debug_inner(hull, params, true, true)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
Reference in New Issue
Block a user