brush-paint: cache hull-derived data (SDF + skeleton + was_ink) per hull

The dominant cost of one paint_fill_with call (~50% wall time at
small radii) is computing the chamfer-3-4 SDF and Zhang-Suen
skeleton, both pure functions of the hull. The optimizer calls
paint_fill_with thousands of times per hull while only varying
brush/walker params, so this work is fully redundant across calls.

New: a process-global Mutex<HashMap<HullKey, Arc<HullData>>> cache
keyed by FNV-1a fingerprint of the hull's pixel coordinates. First
call computes; subsequent calls hand back an Arc<HullData> in O(N)
fingerprint hash + O(1) lookup.

Grid now holds Arc<HullData> for the immutable hull-derived state
(was_ink, sdf, skel_endpoints) and clones only the mutable
`unpainted` mask per call. Bbox + skeleton_length are duplicated
into Grid so the disk-iteration hot path doesn't pay an Arc deref.

Bit-exact w.r.t. the alphabet report. Expected speedup is largest
in the optimizer's tight loop (many calls per hull); the alphabet
report only paints each hull once so most of its wins were from
the prior precompute/bitset commits.
This commit is contained in:
Mitchell Hansen
2026-05-07 00:50:58 -07:00
parent 8cc5b5cc8a
commit 34ee79f543

View File

@@ -12,7 +12,8 @@
// over direction has unpainted ink ahead, while alternate directions
// don't.
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex, OnceLock};

use rayon::prelude::*;

use crate::fill::{FillResult, rdp_simplify_f32, chamfer_distance,
    zhang_suen_thin, prune_skeleton_spurs, zs_neighbors};
@@ -290,7 +291,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
-> (u32, u32, u32)
{
if strokes.is_empty() { return (0, 0, 0); }
let mut count = vec![0u32; grid.was_ink.len()];
let mut count = vec![0u32; grid.hull.was_ink.len()];
let r2 = grid.brush_radius_sq;
for stroke in strokes {
for win in stroke.windows(2) {
@@ -322,7 +323,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
for (i, &c) in count.iter().enumerate() {
if c == 0 { continue; }
total += 1;
if !grid.was_ink.get(i) { bg += 1; }
if !grid.hull.was_ink.get(i) { bg += 1; }
else { repaint += c - 1; }
}
(bg, total, repaint)
@@ -434,6 +435,7 @@ fn encode_coverage_b64(grid: &Grid) -> String {
/// fits L1 nicely, and word-at-a-time popcount is available when
/// scanning whole grids. All ops are `#[inline]` since they're called
/// from the disk-iteration hot path.
#[derive(Clone)]
struct BitMask {
bits: Vec<u64>,
len: usize,
@@ -461,26 +463,124 @@ impl BitMask {
}
}
// ── Coverage grid: bool per pixel, sized to the hull's bbox ─────────────
// ── Hull-derived data: cached per hull.id ───────────────────────────────
struct Grid {
/// Pure-function-of-the-hull state: the bbox/grid dimensions, ink mask,
/// chamfer SDF, and skeleton-endpoint set. Computing chamfer +
/// Zhang-Suen thin + spur-prune is the dominant cost of one
/// `paint_fill_with` call (~50% wall time at small radii). The
/// optimizer calls `paint_fill_with` thousands of times per hull while
/// only varying brush/walker params, so the result is identical every
/// time. A small `(hull.id) → Arc<HullData>` cache eliminates the
/// recomputation across calls.
struct HullData {
bx: i32, by: i32,
width: i32, height: i32,
/// `true` = ink pixel that hasn't been painted yet.
unpainted: BitMask,
/// `true` = pixel was ink in the original glyph (immutable; never
/// changes after construction). Lets relaxation tell "ink" apart from
/// "background" without conflating it with painted state.
was_ink: BitMask,
/// Chamfer 3-4 distance / 3 (≈ Euclidean px from boundary). Used to
/// snap raw start points up the gradient onto the medial-axis ridge,
/// so strokes begin at stroke-centerline rather than polygon-edge.
sdf: Vec<f32>,
/// Skeleton-endpoint pixel positions (degree-1 nodes of the thinned
/// glyph after spur pruning). These are the "legs" — the natural
/// pen-down anchors for a human writing the letter. A closed shape
/// (O, 0, etc.) has zero endpoints.
skel_endpoints: Vec<(i32, i32)>,
skeleton_length: u32,
ink_total: i32,
}
/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
/// IDs from a per-call counter, so distinct hulls from different
/// rasterizations collide on id. Mirror-image letters (p/q at the
/// same scale) can also share area + bounds. Instead the key is a
/// full FNV-1a hash over the pixel coordinate stream — O(N) once per
/// cache miss, and a 64-bit fingerprint makes accidental collisions
/// vanishingly unlikely (though not impossible) in practice.
type HullKey = u64;

/// FNV-1a over the hull's pixels, folding each coordinate in as a
/// 64-bit word. Order-sensitive — assumes `hull.pixels` is produced
/// in a deterministic order per hull (NOTE(review): confirm against
/// extract_hulls).
fn hull_key(hull: &Hull) -> HullKey {
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    hull.pixels.iter().fold(FNV_OFFSET, |acc, &(x, y)| {
        let acc = (acc ^ x as u64).wrapping_mul(FNV_PRIME);
        (acc ^ y as u64).wrapping_mul(FNV_PRIME)
    })
}
/// Lazily-initialized process-global cache: hull fingerprint → shared
/// hull-derived data. `OnceLock` provides thread-safe one-time init of
/// the map; the `Mutex` guards lookups/inserts. Values are `Arc`s, so
/// hits hand out cheap reference-count clones, never deep copies.
fn hull_cache() -> &'static Mutex<HashMap<HullKey, Arc<HullData>>> {
    static CACHE: OnceLock<Mutex<HashMap<HullKey, Arc<HullData>>>> = OnceLock::new();
    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
}
/// Fetch the cached `HullData` for this hull, computing it on first
/// sight. Fast path is fingerprint + one brief lock hold + `Arc` clone.
fn get_or_compute_hull_data(hull: &Hull) -> Arc<HullData> {
    let key = hull_key(hull);
    // Lookup under a short-lived guard; it drops before any heavy work.
    if let Some(hit) = hull_cache().lock().unwrap().get(&key) {
        return Arc::clone(hit);
    }
    // Miss: build the data without holding the lock so concurrent
    // callers on *different* hulls don't serialize behind us. Two
    // threads may race on the *same* hull and both compute — the
    // results are identical, `or_insert_with` keeps whichever landed
    // first, and the loser's copy is simply dropped.
    let fresh = Arc::new(compute_hull_data(hull));
    hull_cache()
        .lock()
        .unwrap()
        .entry(key)
        .or_insert_with(|| Arc::clone(&fresh))
        .clone()
}
/// Build all hull-derived (immutable) state in one pass: padded-bbox
/// grid, ink mask, chamfer SDF, pruned-skeleton endpoints. Pure
/// function of the hull — results are cached by the caller.
fn compute_hull_data(hull: &Hull) -> HullData {
    // Pad the grid past the hull's AABB so that bg pixels swept by a
    // brush that overhangs the polygon (e.g. at the top of an `I`,
    // or the corners of a square letter) are counted instead of
    // silently dropped by the bounds check. PAD must exceed any
    // brush_radius the optimizer might try.
    const PAD: i32 = 32;
    let bx = hull.bounds.x_min as i32 - PAD;
    let by = hull.bounds.y_min as i32 - PAD;
    // .max(1) guards against a degenerate (empty/inverted) bbox.
    let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
    let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
    let cells = (width * height) as usize;
    let mut was_ink = BitMask::new(cells);
    let mut sdf = vec![0.0_f32; cells];
    let mut count = 0;
    // Rasterize hull pixels into the local grid; anything outside the
    // padded bounds is dropped (shouldn't happen given PAD, but the
    // check keeps indexing safe regardless).
    for &(x, y) in &hull.pixels {
        let lx = x as i32 - bx; let ly = y as i32 - by;
        if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
        let idx = (ly * width + lx) as usize;
        was_ink.set(idx);
        count += 1;
    }
    // Chamfer distance per ink pixel, scattered into the flat SDF
    // array. Keys are glyph-space coords (same space as hull.pixels).
    let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
    let dist = chamfer_distance(hull, &pixel_set);
    for (&(x, y), &d) in dist.iter() {
        let lx = x as i32 - bx; let ly = y as i32 - by;
        if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
        sdf[(ly * width + lx) as usize] = d;
    }
    // Skeleton + endpoint detection. Spur-prune length is keyed to the
    // glyph's max stroke half-width (sdf_max) so tiny notches at thick
    // strokes don't masquerade as legs; .max(0.5) avoids a zero prune
    // length on degenerate hulls.
    let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
    let mut skel = zhang_suen_thin(&hull.pixels);
    let spur_len = (sdf_max * 1.5).round() as usize;
    prune_skeleton_spurs(&mut skel, spur_len.max(2));
    // Endpoints = skeleton pixels with exactly one skeleton neighbor
    // (degree-1 nodes of the thinned glyph).
    let skel_endpoints: Vec<(i32, i32)> = skel.iter()
        .filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
        .map(|&(x, y)| (x as i32, y as i32))
        .collect();
    // Skeleton length ≈ pixel count; slightly under-counts diagonal
    // runs (sqrt(2) each) but is fine as a path-budget heuristic.
    let skeleton_length = skel.len() as u32;
    HullData { bx, by, width, height, was_ink, sdf, skel_endpoints,
        skeleton_length, ink_total: count }
}
// ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
struct Grid {
// Bbox is duplicated from `hull` so the disk-iteration hot path
// doesn't pay an Arc deref on every step.
bx: i32, by: i32,
width: i32, height: i32,
/// Cached hull-derived state: was_ink mask, SDF, skeleton
/// endpoints. Shared across all `paint_fill_with` calls on the
/// same hull via Arc — avoids recomputing chamfer + skeleton
/// per call. Read-only from this struct's perspective.
hull: Arc<HullData>,
/// `true` = ink pixel that hasn't been painted yet. Owned, mutable.
/// Initialized as a clone of `hull.was_ink`.
unpainted: BitMask,
/// Approximate medial-axis length, in pixels. Counted as skeleton
/// pixel count (each connected skeleton pixel contributes ~1 px to
/// the centerline length). Used as the "ideal" path length — a
@@ -507,58 +607,23 @@ struct Grid {
impl Grid {
fn from_hull(hull: &Hull) -> Self {
// Pad the grid past the hull's AABB so that bg pixels swept by a
// brush that overhangs the polygon (e.g. at the top of an `I`,
// or the corners of a square letter) are counted instead of
// silently dropped by the bounds check. PAD must exceed any
// brush_radius the optimizer might try.
const PAD: i32 = 32;
let bx = hull.bounds.x_min as i32 - PAD;
let by = hull.bounds.y_min as i32 - PAD;
let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
let cells = (width * height) as usize;
let mut unpainted = BitMask::new(cells);
let mut was_ink = BitMask::new(cells);
let mut sdf = vec![0.0_f32; cells];
let mut count = 0;
for &(x, y) in &hull.pixels {
let lx = x as i32 - bx; let ly = y as i32 - by;
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
let idx = (ly * width + lx) as usize;
unpainted.set(idx);
was_ink.set(idx);
count += 1;
let h = get_or_compute_hull_data(hull);
let unpainted = h.was_ink.clone();
let ink_total = h.ink_total;
let bx = h.bx; let by = h.by;
let width = h.width; let height = h.height;
let skeleton_length = h.skeleton_length;
Self {
bx, by, width, height,
hull: h,
unpainted,
skeleton_length,
ink_total,
ink_remaining: ink_total,
brush_radius: 0.0,
brush_radius_sq: 0.0,
disk_offsets: Vec::new(),
}
// Chamfer distance (per-pixel, in approximate Euclidean units)
// for medial-axis snapping in pick_start.
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
let dist = chamfer_distance(hull, &pixel_set);
for (&(x, y), &d) in dist.iter() {
let lx = x as i32 - bx; let ly = y as i32 - by;
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
sdf[(ly * width + lx) as usize] = d;
}
// Skeleton + endpoint detection. Spur prune length is keyed to the
// glyph's max stroke half-width so tiny notches at thick strokes
// don't masquerade as legs. (zhang_suen + spur_prune is the same
// recipe the topo-stroke and skeleton-fill paths use.)
let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
let mut skel = zhang_suen_thin(&hull.pixels);
let spur_len = (sdf_max * 1.5).round() as usize;
prune_skeleton_spurs(&mut skel, spur_len.max(2));
let skel_endpoints: Vec<(i32, i32)> = skel.iter()
.filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
.map(|&(x, y)| (x as i32, y as i32))
.collect();
// Skeleton length ≈ skeleton pixel count. For an 8-connected
// skeleton this slightly under-counts diagonal segments (sqrt(2)
// each), but it's close enough for a path-budget heuristic.
let skeleton_length = skel.len() as u32;
Self { bx, by, width, height, unpainted, was_ink, sdf, skel_endpoints,
skeleton_length, ink_total: count, ink_remaining: count,
brush_radius: 0.0, brush_radius_sq: 0.0, disk_offsets: Vec::new() }
}
/// Configure the disk shape used for evaluate_disk / paint_disk /
@@ -591,7 +656,7 @@ impl Grid {
fn sdf_at(&self, x: i32, y: i32) -> f32 {
let lx = x - self.bx; let ly = y - self.by;
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return 0.0; }
self.sdf[(ly * self.width + lx) as usize]
self.hull.sdf[(ly * self.width + lx) as usize]
}
/// Snap a raw pixel position onto the medial-axis ridge by greedy
@@ -665,7 +730,7 @@ impl Grid {
fn is_ink(&self, x: i32, y: i32) -> bool {
let lx = x - self.bx; let ly = y - self.by;
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; }
self.was_ink.get((ly * self.width + lx) as usize)
self.hull.was_ink.get((ly * self.width + lx) as usize)
}
/// Returns (new_ink, repaint_ink, bg) — pixel counts under disk(p, r):
@@ -695,7 +760,7 @@ impl Grid {
let idx = (ly * self.width + lx) as usize;
if self.unpainted.get(idx) {
new_ink += 1;
} else if self.was_ink.get(idx) {
} else if self.hull.was_ink.get(idx) {
repaint_ink += 1;
} else {
bg += 1;
@@ -816,7 +881,7 @@ impl Grid {
let (pixels, _) = &components[chosen];
let comp_set: HashSet<usize> = pixels.iter().copied().collect();
let mut best_endpoint: Option<(i32, i32)> = None;
for &(ex, ey) in &self.skel_endpoints {
for &(ex, ey) in self.hull.skel_endpoints.iter() {
let lx = ex - self.bx; let ly = ey - self.by;
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { continue; }
let idx = (ly * self.width + lx) as usize;