brush-paint: cache hull-derived data (SDF + skeleton + was_ink) per hull

The dominant cost of one paint_fill_with call (~50% wall time at small radii) is computing the chamfer-3-4 SDF and Zhang-Suen skeleton, both pure functions of the hull. The optimizer calls paint_fill_with thousands of times per hull while only varying brush/walker params, so this work is fully redundant across calls. New: a process-global Mutex<HashMap<HullKey, Arc<HullData>>> cache keyed by FNV-1a fingerprint of the hull's pixel coordinates. First call computes; subsequent calls hand back an Arc<HullData> in O(N) fingerprint hash + O(1) lookup. Grid now holds Arc<HullData> for the immutable hull-derived state (was_ink, sdf, skel_endpoints) and clones only the mutable `unpainted` mask per call. Bbox + skeleton_length are duplicated into Grid so the disk-iteration hot path doesn't pay an Arc deref. Bit-exact w.r.t. the alphabet report. Expected speedup is largest in the optimizer's tight loop (many calls per hull); the alphabet report only paints each hull once so most of its wins were from the prior precompute/bitset commits.
2026-05-07 00:50:58 -07:00
parent 8cc5b5cc8a
commit 34ee79f543
1 changed files with 137 additions and 72 deletions
--- a/src/brush_paint.rs
+++ b/src/brush_paint.rs
@@ -12,7 +12,8 @@
 // over direction has unpainted ink ahead, while alternate directions
 // don't.
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::sync::{Arc, Mutex, OnceLock};
 use rayon::prelude::*;
 use crate::fill::{FillResult, rdp_simplify_f32, chamfer_distance,
    zhang_suen_thin, prune_skeleton_spurs, zs_neighbors};
@@ -290,7 +291,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
    -> (u32, u32, u32)
 {
    if strokes.is_empty() { return (0, 0, 0); }
-    let mut count = vec![0u32; grid.was_ink.len()];
+    let mut count = vec![0u32; grid.hull.was_ink.len()];
    let r2 = grid.brush_radius_sq;
    for stroke in strokes {
        for win in stroke.windows(2) {
@@ -322,7 +323,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
    for (i, &c) in count.iter().enumerate() {
        if c == 0 { continue; }
        total += 1;
-        if !grid.was_ink.get(i) { bg += 1; }
+        if !grid.hull.was_ink.get(i) { bg += 1; }
        else { repaint += c - 1; }
    }
    (bg, total, repaint)
@@ -434,6 +435,7 @@ fn encode_coverage_b64(grid: &Grid) -> String {
 /// fits L1 nicely, and word-at-a-time popcount is available when
 /// scanning whole grids. All ops are `#[inline]` since they're called
 /// from the disk-iteration hot path.
 #[derive(Clone)]
 struct BitMask {
    bits: Vec<u64>,
    len:  usize,
@@ -461,26 +463,124 @@ impl BitMask {
    }
 }
-// ── Coverage grid: bool per pixel, sized to the hull's bbox ─────────────
+// ── Hull-derived data: cached per hull pixel fingerprint ────────────────
-struct Grid {
+/// Pure-function-of-the-hull state: the bbox/grid dimensions, ink mask,
 /// chamfer SDF, and skeleton-endpoint set. Computing chamfer +
 /// Zhang-Suen thin + spur-prune is the dominant cost of one
 /// `paint_fill_with` call (~50% wall time at small radii). The
 /// optimizer calls `paint_fill_with` thousands of times per hull while
 /// only varying brush/walker params, so the result is identical every
 /// time. A small `HullKey → Arc<HullData>` cache, keyed by a
 /// fingerprint of the hull's pixel coordinates rather than by
 /// `hull.id`, eliminates the recomputation across calls.
 struct HullData {
    bx: i32, by: i32,
    width: i32, height: i32,
    /// `true` = ink pixel that hasn't been painted yet.
    unpainted: BitMask,
    /// `true` = pixel was ink in the original glyph (immutable; never
    /// changes after construction). Lets relaxation tell "ink" apart from
    /// "background" without conflating it with painted state.
    was_ink: BitMask,
    /// Chamfer 3-4 distance / 3 (≈ Euclidean px from boundary). Used to
    /// snap raw start points up the gradient onto the medial-axis ridge,
    /// so strokes begin at stroke-centerline rather than polygon-edge.
    sdf: Vec<f32>,
    /// Skeleton-endpoint pixel positions (degree-1 nodes of the thinned
    /// glyph after spur pruning). These are the "legs" — the natural
    /// pen-down anchors for a human writing the letter. A closed shape
    /// (O, 0, etc.) has zero endpoints.
    skel_endpoints: Vec<(i32, i32)>,
    skeleton_length: u32,
    ink_total: i32,
 }
 /// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
 /// IDs from a per-call counter, so distinct hulls from different
 /// rasterizations collide on id. Mirror-image letters (p/q at the
 /// same scale) can also share area + bounds. We use a 64-bit
 /// FNV-1a-style hash over the pixel coordinate stream as the key.
 /// The hash is recomputed on every lookup (O(N) in the pixel count),
 /// hit or miss.
 ///
 /// NOTE(review): a 64-bit fingerprint makes collisions very unlikely,
 /// not impossible. Nothing re-checks the pixels on a cache hit, so a
 /// colliding hull would silently receive another hull's data. The
 /// hash also depends on the order of `hull.pixels`.
 type HullKey = u64;
 /// Fingerprints a hull by folding its pixel coordinates, in stored
 /// order, into a 64-bit FNV-1a-style hash: each `x` and then each `y`
 /// is XORed into the state as a whole word and the state is multiplied
 /// by the FNV prime. An empty pixel list yields the offset basis.
 fn hull_key(hull: &Hull) -> HullKey {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const PRIME: u64 = 0x100000001b3;
    hull.pixels.iter().fold(OFFSET_BASIS, |state, &(px, py)| {
        let mixed_x = (state ^ px as u64).wrapping_mul(PRIME);
        (mixed_x ^ py as u64).wrapping_mul(PRIME)
    })
 }
 /// Returns the process-wide hull-data cache. The map is created empty
 /// on first use and every later call hands back the same instance.
 fn hull_cache() -> &'static Mutex<HashMap<HullKey, Arc<HullData>>> {
    type Table = Mutex<HashMap<HullKey, Arc<HullData>>>;
    static CACHE: OnceLock<Table> = OnceLock::new();
    CACHE.get_or_init(Table::default)
 }
 /// Returns the shared [`HullData`] for `hull`, computing and caching
 /// it when the hull's fingerprint has not been seen before.
 ///
 /// Panics if the cache mutex is poisoned.
 fn get_or_compute_hull_data(hull: &Hull) -> Arc<HullData> {
    let fingerprint = hull_key(hull);

    // Fast path. The guard is a temporary of this statement, so the
    // lock is already released by the time we look at `cached`.
    let cached = hull_cache().lock().unwrap().get(&fingerprint).map(Arc::clone);
    if let Some(shared) = cached {
        return shared;
    }

    // Miss: do the expensive work with the lock released, so callers
    // working on other hulls are not serialized behind it. Two threads
    // may both miss on the same hull and both compute; the results are
    // identical, whichever is inserted first wins, and the other copy
    // is simply dropped.
    let fresh = Arc::new(compute_hull_data(hull));
    let mut table = hull_cache().lock().unwrap();
    let winner = table.entry(fingerprint).or_insert_with(|| Arc::clone(&fresh));
    Arc::clone(winner)
 }
 /// Builds the immutable, hull-derived state for one hull: the padded
 /// grid geometry, the ink mask, the per-pixel chamfer distance field,
 /// the skeleton endpoints and the skeleton pixel count.
 ///
 /// Nothing here depends on brush or walker parameters; the only
 /// input is `hull` (its `bounds` and `pixels`).
 fn compute_hull_data(hull: &Hull) -> HullData {
    // Pad the grid past the hull's AABB so that bg pixels swept by a
    // brush that overhangs the polygon (e.g. at the top of an `I`,
    // or the corners of a square letter) are counted instead of
    // silently dropped by the bounds check. PAD must exceed any
    // brush_radius the optimizer might try.
    const PAD: i32 = 32;
    let bx = hull.bounds.x_min as i32 - PAD;
    let by = hull.bounds.y_min as i32 - PAD;
    let width  = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
    let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
    // NOTE(review): `width * height` is an i32 multiply; presumably glyph
    // bounds are small enough that this cannot overflow — confirm.
    let cells = (width * height) as usize;
    let mut was_ink = BitMask::new(cells);
    let mut sdf     = vec![0.0_f32; cells];
    // Mark every hull pixel as ink (row-major index `ly * width + lx`)
    // and count them; pixels outside the padded grid are skipped and
    // not counted.
    let mut count = 0;
    for &(x, y) in &hull.pixels {
        let lx = x as i32 - bx; let ly = y as i32 - by;
        if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
        let idx = (ly * width + lx) as usize;
        was_ink.set(idx);
        count += 1;
    }
    // Chamfer distance (per-pixel, in approximate Euclidean units)
    // for medial-axis snapping in pick_start. Cells with no entry in
    // `dist` keep their initial 0.0.
    let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
    let dist = chamfer_distance(hull, &pixel_set);
    for (&(x, y), &d) in dist.iter() {
        let lx = x as i32 - bx; let ly = y as i32 - by;
        if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
        sdf[(ly * width + lx) as usize] = d;
    }
    // Skeleton + endpoint detection. Spur prune length is keyed to the
    // glyph's max stroke half-width so tiny notches at thick strokes
    // don't masquerade as legs. `sdf_max` is floored at 0.5 and the
    // prune length at 2.
    let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
    let mut skel = zhang_suen_thin(&hull.pixels);
    let spur_len = (sdf_max * 1.5).round() as usize;
    prune_skeleton_spurs(&mut skel, spur_len.max(2));
    // An endpoint is a skeleton pixel with exactly one of its
    // `zs_neighbors` also on the skeleton.
    let skel_endpoints: Vec<(i32, i32)> = skel.iter()
        .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
        .map(|&(x, y)| (x as i32, y as i32))
        .collect();
    // Skeleton length ≈ skeleton pixel count. For an 8-connected
    // skeleton this slightly under-counts diagonal segments (sqrt(2)
    // each), but it's close enough for a path-budget heuristic.
    let skeleton_length = skel.len() as u32;
    HullData { bx, by, width, height, was_ink, sdf, skel_endpoints,
               skeleton_length, ink_total: count }
 }
 // ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
 struct Grid {
    // Bbox is duplicated from `hull` so the disk-iteration hot path
    // doesn't pay an Arc deref on every step.
    bx: i32, by: i32,
    width: i32, height: i32,
    /// Cached hull-derived state: was_ink mask, SDF, skeleton
    /// endpoints. Shared across all `paint_fill_with` calls on the
    /// same hull via Arc — avoids recomputing chamfer + skeleton
    /// per call. Read-only from this struct's perspective.
    hull: Arc<HullData>,
    /// `true` = ink pixel that hasn't been painted yet. Owned, mutable.
    /// Initialized as a clone of `hull.was_ink`.
    unpainted: BitMask,
    /// Approximate medial-axis length, in pixels. Counted as skeleton
    /// pixel count (each connected skeleton pixel contributes ~1 px to
    /// the centerline length). Used as the "ideal" path length — a
@@ -507,58 +607,23 @@ struct Grid {
 impl Grid {
    fn from_hull(hull: &Hull) -> Self {
-        // Pad the grid past the hull's AABB so that bg pixels swept by a
+        let h = get_or_compute_hull_data(hull);
-        // brush that overhangs the polygon (e.g. at the top of an `I`,
+        let unpainted = h.was_ink.clone();
-        // or the corners of a square letter) are counted instead of
+        let ink_total = h.ink_total;
-        // silently dropped by the bounds check. PAD must exceed any
+        let bx = h.bx; let by = h.by;
-        // brush_radius the optimizer might try.
+        let width = h.width; let height = h.height;
-        const PAD: i32 = 32;
+        let skeleton_length = h.skeleton_length;
-        let bx = hull.bounds.x_min as i32 - PAD;
+        Self {
-        let by = hull.bounds.y_min as i32 - PAD;
+            bx, by, width, height,
-        let width  = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
+            hull: h,
-        let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
+            unpainted,
-        let cells = (width * height) as usize;
+            skeleton_length,
-        let mut unpainted = BitMask::new(cells);
+            ink_total,
-        let mut was_ink   = BitMask::new(cells);
+            ink_remaining: ink_total,
-        let mut sdf       = vec![0.0_f32; cells];
+            brush_radius: 0.0,
-        let mut count = 0;
+            brush_radius_sq: 0.0,
-        for &(x, y) in &hull.pixels {
+            disk_offsets: Vec::new(),
            let lx = x as i32 - bx; let ly = y as i32 - by;
            if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
            let idx = (ly * width + lx) as usize;
            unpainted.set(idx);
            was_ink.set(idx);
            count += 1;
        }
        // Chamfer distance (per-pixel, in approximate Euclidean units)
        // for medial-axis snapping in pick_start.
        let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
        let dist = chamfer_distance(hull, &pixel_set);
        for (&(x, y), &d) in dist.iter() {
            let lx = x as i32 - bx; let ly = y as i32 - by;
            if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
            sdf[(ly * width + lx) as usize] = d;
        }
        // Skeleton + endpoint detection. Spur prune length is keyed to the
        // glyph's max stroke half-width so tiny notches at thick strokes
        // don't masquerade as legs. (zhang_suen + spur_prune is the same
        // recipe the topo-stroke and skeleton-fill paths use.)
        let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
        let mut skel = zhang_suen_thin(&hull.pixels);
        let spur_len = (sdf_max * 1.5).round() as usize;
        prune_skeleton_spurs(&mut skel, spur_len.max(2));
        let skel_endpoints: Vec<(i32, i32)> = skel.iter()
            .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
            .map(|&(x, y)| (x as i32, y as i32))
            .collect();
        // Skeleton length ≈ skeleton pixel count. For an 8-connected
        // skeleton this slightly under-counts diagonal segments (sqrt(2)
        // each), but it's close enough for a path-budget heuristic.
        let skeleton_length = skel.len() as u32;
        Self { bx, by, width, height, unpainted, was_ink, sdf, skel_endpoints,
               skeleton_length, ink_total: count, ink_remaining: count,
               brush_radius: 0.0, brush_radius_sq: 0.0, disk_offsets: Vec::new() }
    }
    /// Configure the disk shape used for evaluate_disk / paint_disk /
@@ -591,7 +656,7 @@ impl Grid {
    fn sdf_at(&self, x: i32, y: i32) -> f32 {
        let lx = x - self.bx; let ly = y - self.by;
        if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return 0.0; }
-        self.sdf[(ly * self.width + lx) as usize]
+        self.hull.sdf[(ly * self.width + lx) as usize]
    }
    /// Snap a raw pixel position onto the medial-axis ridge by greedy
@@ -665,7 +730,7 @@ impl Grid {
    fn is_ink(&self, x: i32, y: i32) -> bool {
        let lx = x - self.bx; let ly = y - self.by;
        if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; }
-        self.was_ink.get((ly * self.width + lx) as usize)
+        self.hull.was_ink.get((ly * self.width + lx) as usize)
    }
    /// Returns (new_ink, repaint_ink, bg) — pixel counts under disk(p, r):
@@ -695,7 +760,7 @@ impl Grid {
            let idx = (ly * self.width + lx) as usize;
            if self.unpainted.get(idx) {
                new_ink += 1;
-            } else if self.was_ink.get(idx) {
+            } else if self.hull.was_ink.get(idx) {
                repaint_ink += 1;
            } else {
                bg += 1;
@@ -816,7 +881,7 @@ impl Grid {
        let (pixels, _) = &components[chosen];
        let comp_set: HashSet<usize> = pixels.iter().copied().collect();
        let mut best_endpoint: Option<(i32, i32)> = None;
-        for &(ex, ey) in &self.skel_endpoints {
+        for &(ex, ey) in self.hull.skel_endpoints.iter() {
            let lx = ex - self.bx; let ly = ey - self.by;
            if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { continue; }
            let idx = (ly * self.width + lx) as usize;