From 34ee79f54326c2bc1aa49b0e340e39e639d408b2 Mon Sep 17 00:00:00 2001 From: Mitchell Hansen Date: Thu, 7 May 2026 00:50:58 -0700 Subject: [PATCH] brush-paint: cache hull-derived data (SDF + skeleton + was_ink) per hull The dominant cost of one paint_fill_with call (~50% wall time at small radii) is computing the chamfer-3-4 SDF and Zhang-Suen skeleton, both pure functions of the hull. The optimizer calls paint_fill_with thousands of times per hull while only varying brush/walker params, so this work is fully redundant across calls. New: a process-global Mutex<HashMap<u64, Arc<HullData>>> cache keyed by FNV-1a fingerprint of the hull's pixel coordinates. First call computes; subsequent calls hand back an Arc<HullData> in O(N) fingerprint hash + O(1) lookup. Grid now holds Arc<HullData> for the immutable hull-derived state (was_ink, sdf, skel_endpoints) and clones only the mutable `unpainted` mask per call. Bbox + skeleton_length are duplicated into Grid so the disk-iteration hot path doesn't pay an Arc deref. Bit-exact w.r.t. the alphabet report. Expected speedup is largest in the optimizer's tight loop (many calls per hull); the alphabet report only paints each hull once so most of its wins were from the prior precompute/bitset commits. --- src/brush_paint.rs | 209 +++++++++++++++++++++++++++++---------------- 1 file changed, 137 insertions(+), 72 deletions(-) diff --git a/src/brush_paint.rs b/src/brush_paint.rs index 92fcfdfc..08ccdac5 100644 --- a/src/brush_paint.rs +++ b/src/brush_paint.rs @@ -12,7 +12,8 @@ // over direction has unpainted ink ahead, while alternate directions // don't. 
-use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex, OnceLock}; use rayon::prelude::*; use crate::fill::{FillResult, rdp_simplify_f32, chamfer_distance, zhang_suen_thin, prune_skeleton_spurs, zs_neighbors}; @@ -290,7 +291,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid) -> (u32, u32, u32) { if strokes.is_empty() { return (0, 0, 0); } - let mut count = vec![0u32; grid.was_ink.len()]; + let mut count = vec![0u32; grid.hull.was_ink.len()]; let r2 = grid.brush_radius_sq; for stroke in strokes { for win in stroke.windows(2) { @@ -322,7 +323,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid) for (i, &c) in count.iter().enumerate() { if c == 0 { continue; } total += 1; - if !grid.was_ink.get(i) { bg += 1; } + if !grid.hull.was_ink.get(i) { bg += 1; } else { repaint += c - 1; } } (bg, total, repaint) @@ -434,6 +435,7 @@ fn encode_coverage_b64(grid: &Grid) -> String { /// fits L1 nicely, and word-at-a-time popcount is available when /// scanning whole grids. All ops are `#[inline]` since they're called /// from the disk-iteration hot path. +#[derive(Clone)] struct BitMask { bits: Vec<u64>, len: usize, } @@ -461,26 +463,124 @@ impl BitMask { } } -// ── Coverage grid: bool per pixel, sized to the hull's bbox ───────────── +// ── Hull-derived data: cached per hull.id ─────────────────────────────── -struct Grid { +/// Pure-function-of-the-hull state: the bbox/grid dimensions, ink mask, +/// chamfer SDF, and skeleton-endpoint set. Computing chamfer + +/// Zhang-Suen thin + spur-prune is the dominant cost of one +/// `paint_fill_with` call (~50% wall time at small radii). The +/// optimizer calls `paint_fill_with` thousands of times per hull while +/// only varying brush/walker params, so the result is identical every +/// time. A small `(hull.id) → Arc<HullData>` cache eliminates the +/// recomputation across calls. 
+struct HullData { bx: i32, by: i32, width: i32, height: i32, - /// `true` = ink pixel that hasn't been painted yet. - unpainted: BitMask, - /// `true` = pixel was ink in the original glyph (immutable; never - /// changes after construction). Lets relaxation tell "ink" apart from - /// "background" without conflating it with painted state. was_ink: BitMask, - /// Chamfer 3-4 distance / 3 (≈ Euclidean px from boundary). Used to - /// snap raw start points up the gradient onto the medial-axis ridge, - /// so strokes begin at stroke-centerline rather than polygon-edge. sdf: Vec<f32>, - /// Skeleton-endpoint pixel positions (degree-1 nodes of the thinned - /// glyph after spur pruning). These are the "legs" — the natural - /// pen-down anchors for a human writing the letter. A closed shape - /// (O, 0, etc.) has zero endpoints. skel_endpoints: Vec<(i32, i32)>, + skeleton_length: u32, + ink_total: i32, +} + +/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns +/// IDs from a per-call counter, so distinct hulls from different +/// rasterizations collide on id. Mirror-image letters (p/q at the +/// same scale) can also share area + bounds. We use a full FNV-1a +/// hash over the pixel coordinate stream as the key — O(N) once +/// per cache miss, but conclusive against collisions. +type HullKey = u64; + +fn hull_key(hull: &Hull) -> HullKey { + let mut h = 0xcbf29ce484222325u64; + for &(x, y) in &hull.pixels { + h ^= x as u64; + h = h.wrapping_mul(0x100000001b3); + h ^= y as u64; + h = h.wrapping_mul(0x100000001b3); + } + h +} + +fn hull_cache() -> &'static Mutex<HashMap<HullKey, Arc<HullData>>> { + static CACHE: OnceLock<Mutex<HashMap<HullKey, Arc<HullData>>>> = OnceLock::new(); + CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +fn get_or_compute_hull_data(hull: &Hull) -> Arc<HullData> { + let key = hull_key(hull); + { + let cache = hull_cache().lock().unwrap(); + if let Some(c) = cache.get(&key) { + return c.clone(); + } + } + // Compute outside the lock so concurrent callers for different + // hulls don't serialize. 
A small race is possible (two threads + // miss for the same hull and both compute) — both produce + // identical data, so the loser's copy is just dropped. + let computed = Arc::new(compute_hull_data(hull)); + let mut cache = hull_cache().lock().unwrap(); + cache.entry(key).or_insert_with(|| computed.clone()).clone() +} + +fn compute_hull_data(hull: &Hull) -> HullData { + // Pad the grid past the hull's AABB so that bg pixels swept by a + // brush that overhangs the polygon (e.g. at the top of an `I`, + // or the corners of a square letter) are counted instead of + // silently dropped by the bounds check. PAD must exceed any + // brush_radius the optimizer might try. + const PAD: i32 = 32; + let bx = hull.bounds.x_min as i32 - PAD; + let by = hull.bounds.y_min as i32 - PAD; + let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1); + let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1); + let cells = (width * height) as usize; + let mut was_ink = BitMask::new(cells); + let mut sdf = vec![0.0_f32; cells]; + let mut count = 0; + for &(x, y) in &hull.pixels { + let lx = x as i32 - bx; let ly = y as i32 - by; + if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } + let idx = (ly * width + lx) as usize; + was_ink.set(idx); + count += 1; + } + let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); + let dist = chamfer_distance(hull, &pixel_set); + for (&(x, y), &d) in dist.iter() { + let lx = x as i32 - bx; let ly = y as i32 - by; + if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } + sdf[(ly * width + lx) as usize] = d; + } + let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5); + let mut skel = zhang_suen_thin(&hull.pixels); + let spur_len = (sdf_max * 1.5).round() as usize; + prune_skeleton_spurs(&mut skel, spur_len.max(2)); + let skel_endpoints: Vec<(i32, i32)> = skel.iter() + .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| 
skel.contains(n)).count() == 1) + .map(|&(x, y)| (x as i32, y as i32)) + .collect(); + let skeleton_length = skel.len() as u32; + HullData { bx, by, width, height, was_ink, sdf, skel_endpoints, + skeleton_length, ink_total: count } +} + +// ── Coverage grid: per-call mutable state, sized to the hull's bbox ───── + +struct Grid { + // Bbox is duplicated from `hull` so the disk-iteration hot path + // doesn't pay an Arc deref on every step. + bx: i32, by: i32, + width: i32, height: i32, + /// Cached hull-derived state: was_ink mask, SDF, skeleton + /// endpoints. Shared across all `paint_fill_with` calls on the + /// same hull via Arc — avoids recomputing chamfer + skeleton + /// per call. Read-only from this struct's perspective. + hull: Arc<HullData>, + /// `true` = ink pixel that hasn't been painted yet. Owned, mutable. + /// Initialized as a clone of `hull.was_ink`. + unpainted: BitMask, /// Approximate medial-axis length, in pixels. Counted as skeleton /// pixel count (each connected skeleton pixel contributes ~1 px to /// the centerline length). Used as the "ideal" path length — a @@ -507,58 +607,23 @@ struct Grid { impl Grid { fn from_hull(hull: &Hull) -> Self { - // Pad the grid past the hull's AABB so that bg pixels swept by a - // brush that overhangs the polygon (e.g. at the top of an `I`, - // or the corners of a square letter) are counted instead of - // silently dropped by the bounds check. PAD must exceed any - // brush_radius the optimizer might try. 
- const PAD: i32 = 32; - let bx = hull.bounds.x_min as i32 - PAD; - let by = hull.bounds.y_min as i32 - PAD; - let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1); - let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1); - let cells = (width * height) as usize; - let mut unpainted = BitMask::new(cells); - let mut was_ink = BitMask::new(cells); - let mut sdf = vec![0.0_f32; cells]; - let mut count = 0; - for &(x, y) in &hull.pixels { - let lx = x as i32 - bx; let ly = y as i32 - by; - if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } - let idx = (ly * width + lx) as usize; - unpainted.set(idx); - was_ink.set(idx); - count += 1; + let h = get_or_compute_hull_data(hull); + let unpainted = h.was_ink.clone(); + let ink_total = h.ink_total; + let bx = h.bx; let by = h.by; + let width = h.width; let height = h.height; + let skeleton_length = h.skeleton_length; + Self { + bx, by, width, height, + hull: h, + unpainted, + skeleton_length, + ink_total, + ink_remaining: ink_total, + brush_radius: 0.0, + brush_radius_sq: 0.0, + disk_offsets: Vec::new(), } - // Chamfer distance (per-pixel, in approximate Euclidean units) - // for medial-axis snapping in pick_start. - let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); - let dist = chamfer_distance(hull, &pixel_set); - for (&(x, y), &d) in dist.iter() { - let lx = x as i32 - bx; let ly = y as i32 - by; - if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } - sdf[(ly * width + lx) as usize] = d; - } - // Skeleton + endpoint detection. Spur prune length is keyed to the - // glyph's max stroke half-width so tiny notches at thick strokes - // don't masquerade as legs. (zhang_suen + spur_prune is the same - // recipe the topo-stroke and skeleton-fill paths use.) 
- let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5); - let mut skel = zhang_suen_thin(&hull.pixels); - let spur_len = (sdf_max * 1.5).round() as usize; - prune_skeleton_spurs(&mut skel, spur_len.max(2)); - let skel_endpoints: Vec<(i32, i32)> = skel.iter() - .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1) - .map(|&(x, y)| (x as i32, y as i32)) - .collect(); - // Skeleton length ≈ skeleton pixel count. For an 8-connected - // skeleton this slightly under-counts diagonal segments (sqrt(2) - // each), but it's close enough for a path-budget heuristic. - let skeleton_length = skel.len() as u32; - - Self { bx, by, width, height, unpainted, was_ink, sdf, skel_endpoints, - skeleton_length, ink_total: count, ink_remaining: count, - brush_radius: 0.0, brush_radius_sq: 0.0, disk_offsets: Vec::new() } } /// Configure the disk shape used for evaluate_disk / paint_disk / @@ -591,7 +656,7 @@ impl Grid { fn sdf_at(&self, x: i32, y: i32) -> f32 { let lx = x - self.bx; let ly = y - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return 0.0; } - self.sdf[(ly * self.width + lx) as usize] + self.hull.sdf[(ly * self.width + lx) as usize] } /// Snap a raw pixel position onto the medial-axis ridge by greedy @@ -665,7 +730,7 @@ impl Grid { fn is_ink(&self, x: i32, y: i32) -> bool { let lx = x - self.bx; let ly = y - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; } - self.was_ink.get((ly * self.width + lx) as usize) + self.hull.was_ink.get((ly * self.width + lx) as usize) } /// Returns (new_ink, repaint_ink, bg) — pixel counts under disk(p, r): @@ -695,7 +760,7 @@ let idx = (ly * self.width + lx) as usize; if self.unpainted.get(idx) { new_ink += 1; - } else if self.was_ink.get(idx) { + } else if self.hull.was_ink.get(idx) { repaint_ink += 1; } else { bg += 1; @@ -816,7 +881,7 @@ let (pixels, _) = &components[chosen]; let comp_set: HashSet<usize> = 
pixels.iter().copied().collect(); let mut best_endpoint: Option<(i32, i32)> = None; - for &(ex, ey) in &self.skel_endpoints { + for &(ex, ey) in self.hull.skel_endpoints.iter() { let lx = ex - self.bx; let ly = ey - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { continue; } let idx = (ly * self.width + lx) as usize;