From 34ee79f54326c2bc1aa49b0e340e39e639d408b2 Mon Sep 17 00:00:00 2001 From: Mitchell Hansen Date: Thu, 7 May 2026 00:50:58 -0700 Subject: [PATCH] brush-paint: cache hull-derived data (SDF + skeleton + was_ink) per hull The dominant cost of one paint_fill_with call (~50% wall time at small radii) is computing the chamfer-3-4 SDF and Zhang-Suen skeleton, both pure functions of the hull. The optimizer calls paint_fill_with thousands of times per hull while only varying brush/walker params, so this work is fully redundant across calls. New: a process-global Mutex<HashMap<u64, Arc<HullData>>> cache keyed by FNV-1a fingerprint of the hull's pixel coordinates. First call computes; subsequent calls hand back an Arc<HullData> in O(N) fingerprint hash + O(1) lookup. Grid now holds Arc<HullData> for the immutable hull-derived state (was_ink, sdf, skel_endpoints) and clones only the mutable `unpainted` mask per call. Bbox + skeleton_length are duplicated into Grid so the disk-iteration hot path doesn't pay an Arc deref. Bit-exact w.r.t. the alphabet report. Expected speedup is largest in the optimizer's tight loop (many calls per hull); the alphabet report only paints each hull once so most of its wins were from the prior precompute/bitset commits. --- src/brush_paint.rs | 209 +++++++++++++++++++++++++++++---------------- 1 file changed, 137 insertions(+), 72 deletions(-) diff --git a/src/brush_paint.rs b/src/brush_paint.rs index 92fcfdfc..08ccdac5 100644 --- a/src/brush_paint.rs +++ b/src/brush_paint.rs @@ -12,7 +12,8 @@ // over direction has unpainted ink ahead, while alternate directions // don't. 
-use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex, OnceLock}; use rayon::prelude::*; use crate::fill::{FillResult, rdp_simplify_f32, chamfer_distance, zhang_suen_thin, prune_skeleton_spurs, zs_neighbors}; @@ -290,7 +291,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid) -> (u32, u32, u32) { if strokes.is_empty() { return (0, 0, 0); } - let mut count = vec![0u32; grid.was_ink.len()]; + let mut count = vec![0u32; grid.hull.was_ink.len()]; let r2 = grid.brush_radius_sq; for stroke in strokes { for win in stroke.windows(2) { @@ -322,7 +323,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid) for (i, &c) in count.iter().enumerate() { if c == 0 { continue; } total += 1; - if !grid.was_ink.get(i) { bg += 1; } + if !grid.hull.was_ink.get(i) { bg += 1; } else { repaint += c - 1; } } (bg, total, repaint) @@ -434,6 +435,7 @@ fn encode_coverage_b64(grid: &Grid) -> String { /// fits L1 nicely, and word-at-a-time popcount is available when /// scanning whole grids. All ops are `#[inline]` since they're called /// from the disk-iteration hot path. +#[derive(Clone)] struct BitMask { bits: Vec<u64>, len: usize, } @@ -461,26 +463,124 @@ impl BitMask { } } -// ── Coverage grid: bool per pixel, sized to the hull's bbox ───────────── +// ── Hull-derived data: cached per hull.id ─────────────────────────────── -struct Grid { +/// Pure-function-of-the-hull state: the bbox/grid dimensions, ink mask, +/// chamfer SDF, and skeleton-endpoint set. Computing chamfer + +/// Zhang-Suen thin + spur-prune is the dominant cost of one +/// `paint_fill_with` call (~50% wall time at small radii). The +/// optimizer calls `paint_fill_with` thousands of times per hull while +/// only varying brush/walker params, so the result is identical every +/// time. A small `(hull.id) → Arc<HullData>` cache eliminates the +/// recomputation across calls. 
+struct HullData { bx: i32, by: i32, width: i32, height: i32, - /// `true` = ink pixel that hasn't been painted yet. - unpainted: BitMask, - /// `true` = pixel was ink in the original glyph (immutable; never - /// changes after construction). Lets relaxation tell "ink" apart from - /// "background" without conflating it with painted state. was_ink: BitMask, - /// Chamfer 3-4 distance / 3 (≈ Euclidean px from boundary). Used to - /// snap raw start points up the gradient onto the medial-axis ridge, - /// so strokes begin at stroke-centerline rather than polygon-edge. sdf: Vec<f32>, - /// Skeleton-endpoint pixel positions (degree-1 nodes of the thinned - /// glyph after spur pruning). These are the "legs" — the natural - /// pen-down anchors for a human writing the letter. A closed shape - /// (O, 0, etc.) has zero endpoints. skel_endpoints: Vec<(i32, i32)>, + skeleton_length: u32, + ink_total: i32, +} + +/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns +/// IDs from a per-call counter, so distinct hulls from different +/// rasterizations collide on id. Mirror-image letters (p/q at the +/// same scale) can also share area + bounds. We use a full FNV-1a +/// hash over the pixel coordinate stream as the key — O(N) once +/// per cache miss, but conclusive against collisions. +type HullKey = u64; + +fn hull_key(hull: &Hull) -> HullKey { + let mut h = 0xcbf29ce484222325u64; + for &(x, y) in &hull.pixels { + h ^= x as u64; + h = h.wrapping_mul(0x100000001b3); + h ^= y as u64; + h = h.wrapping_mul(0x100000001b3); + } + h +} + +fn hull_cache() -> &'static Mutex<HashMap<HullKey, Arc<HullData>>> { + static CACHE: OnceLock<Mutex<HashMap<HullKey, Arc<HullData>>>> = OnceLock::new(); + CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +fn get_or_compute_hull_data(hull: &Hull) -> Arc<HullData> { + let key = hull_key(hull); + { + let cache = hull_cache().lock().unwrap(); + if let Some(c) = cache.get(&key) { + return c.clone(); + } + } + // Compute outside the lock so concurrent callers for different + // hulls don't serialize. 
A small race is possible (two threads + // miss for the same hull and both compute) — both produce + // identical data, so the loser's copy is just dropped. + let computed = Arc::new(compute_hull_data(hull)); + let mut cache = hull_cache().lock().unwrap(); + cache.entry(key).or_insert_with(|| computed.clone()).clone() +} + +fn compute_hull_data(hull: &Hull) -> HullData { + // Pad the grid past the hull's AABB so that bg pixels swept by a + // brush that overhangs the polygon (e.g. at the top of an `I`, + // or the corners of a square letter) are counted instead of + // silently dropped by the bounds check. PAD must exceed any + // brush_radius the optimizer might try. + const PAD: i32 = 32; + let bx = hull.bounds.x_min as i32 - PAD; + let by = hull.bounds.y_min as i32 - PAD; + let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1); + let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1); + let cells = (width * height) as usize; + let mut was_ink = BitMask::new(cells); + let mut sdf = vec![0.0_f32; cells]; + let mut count = 0; + for &(x, y) in &hull.pixels { + let lx = x as i32 - bx; let ly = y as i32 - by; + if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } + let idx = (ly * width + lx) as usize; + was_ink.set(idx); + count += 1; + } + let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); + let dist = chamfer_distance(hull, &pixel_set); + for (&(x, y), &d) in dist.iter() { + let lx = x as i32 - bx; let ly = y as i32 - by; + if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } + sdf[(ly * width + lx) as usize] = d; + } + let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5); + let mut skel = zhang_suen_thin(&hull.pixels); + let spur_len = (sdf_max * 1.5).round() as usize; + prune_skeleton_spurs(&mut skel, spur_len.max(2)); + let skel_endpoints: Vec<(i32, i32)> = skel.iter() + .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| 
skel.contains(n)).count() == 1) + .map(|&(x, y)| (x as i32, y as i32)) + .collect(); + let skeleton_length = skel.len() as u32; + HullData { bx, by, width, height, was_ink, sdf, skel_endpoints, + skeleton_length, ink_total: count } +} + +// ── Coverage grid: per-call mutable state, sized to the hull's bbox ───── + +struct Grid { + // Bbox is duplicated from `hull` so the disk-iteration hot path + // doesn't pay an Arc deref on every step. + bx: i32, by: i32, + width: i32, height: i32, + /// Cached hull-derived state: was_ink mask, SDF, skeleton + /// endpoints. Shared across all `paint_fill_with` calls on the + /// same hull via Arc — avoids recomputing chamfer + skeleton + /// per call. Read-only from this struct's perspective. + hull: Arc<HullData>, + /// `true` = ink pixel that hasn't been painted yet. Owned, mutable. + /// Initialized as a clone of `hull.was_ink`. + unpainted: BitMask, /// Approximate medial-axis length, in pixels. Counted as skeleton /// pixel count (each connected skeleton pixel contributes ~1 px to /// the centerline length). Used as the "ideal" path length — a @@ -507,58 +607,23 @@ struct Grid { impl Grid { fn from_hull(hull: &Hull) -> Self { - // Pad the grid past the hull's AABB so that bg pixels swept by a - // brush that overhangs the polygon (e.g. at the top of an `I`, - // or the corners of a square letter) are counted instead of - // silently dropped by the bounds check. PAD must exceed any - // brush_radius the optimizer might try. 
- const PAD: i32 = 32; - let bx = hull.bounds.x_min as i32 - PAD; - let by = hull.bounds.y_min as i32 - PAD; - let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1); - let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1); - let cells = (width * height) as usize; - let mut unpainted = BitMask::new(cells); - let mut was_ink = BitMask::new(cells); - let mut sdf = vec![0.0_f32; cells]; - let mut count = 0; - for &(x, y) in &hull.pixels { - let lx = x as i32 - bx; let ly = y as i32 - by; - if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } - let idx = (ly * width + lx) as usize; - unpainted.set(idx); - was_ink.set(idx); - count += 1; + let h = get_or_compute_hull_data(hull); + let unpainted = h.was_ink.clone(); + let ink_total = h.ink_total; + let bx = h.bx; let by = h.by; + let width = h.width; let height = h.height; + let skeleton_length = h.skeleton_length; + Self { + bx, by, width, height, + hull: h, + unpainted, + skeleton_length, + ink_total, + ink_remaining: ink_total, + brush_radius: 0.0, + brush_radius_sq: 0.0, + disk_offsets: Vec::new(), } - // Chamfer distance (per-pixel, in approximate Euclidean units) - // for medial-axis snapping in pick_start. - let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect(); - let dist = chamfer_distance(hull, &pixel_set); - for (&(x, y), &d) in dist.iter() { - let lx = x as i32 - bx; let ly = y as i32 - by; - if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; } - sdf[(ly * width + lx) as usize] = d; - } - // Skeleton + endpoint detection. Spur prune length is keyed to the - // glyph's max stroke half-width so tiny notches at thick strokes - // don't masquerade as legs. (zhang_suen + spur_prune is the same - // recipe the topo-stroke and skeleton-fill paths use.) 
- let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5); - let mut skel = zhang_suen_thin(&hull.pixels); - let spur_len = (sdf_max * 1.5).round() as usize; - prune_skeleton_spurs(&mut skel, spur_len.max(2)); - let skel_endpoints: Vec<(i32, i32)> = skel.iter() - .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1) - .map(|&(x, y)| (x as i32, y as i32)) - .collect(); - // Skeleton length ≈ skeleton pixel count. For an 8-connected - // skeleton this slightly under-counts diagonal segments (sqrt(2) - // each), but it's close enough for a path-budget heuristic. - let skeleton_length = skel.len() as u32; - - Self { bx, by, width, height, unpainted, was_ink, sdf, skel_endpoints, - skeleton_length, ink_total: count, ink_remaining: count, - brush_radius: 0.0, brush_radius_sq: 0.0, disk_offsets: Vec::new() } } /// Configure the disk shape used for evaluate_disk / paint_disk / @@ -591,7 +656,7 @@ impl Grid { fn sdf_at(&self, x: i32, y: i32) -> f32 { let lx = x - self.bx; let ly = y - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return 0.0; } - self.sdf[(ly * self.width + lx) as usize] + self.hull.sdf[(ly * self.width + lx) as usize] } /// Snap a raw pixel position onto the medial-axis ridge by greedy @@ -665,7 +730,7 @@ impl Grid { fn is_ink(&self, x: i32, y: i32) -> bool { let lx = x - self.bx; let ly = y - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; } - self.was_ink.get((ly * self.width + lx) as usize) + self.hull.was_ink.get((ly * self.width + lx) as usize) } /// Returns (new_ink, repaint_ink, bg) — pixel counts under disk(p, r): @@ -695,7 +760,7 @@ let idx = (ly * self.width + lx) as usize; if self.unpainted.get(idx) { new_ink += 1; - } else if self.was_ink.get(idx) { + } else if self.hull.was_ink.get(idx) { repaint_ink += 1; } else { bg += 1; @@ -816,7 +881,7 @@ let (pixels, _) = &components[chosen]; let comp_set: HashSet<usize> = 
pixels.iter().copied().collect(); let mut best_endpoint: Option<(i32, i32)> = None; - for &(ex, ey) in &self.skel_endpoints { + for &(ex, ey) in self.hull.skel_endpoints.iter() { let lx = ex - self.bx; let ly = ey - self.by; if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { continue; } let idx = (ly * self.width + lx) as usize;