brush-paint: cache hull-derived data (SDF + skeleton + was_ink) per hull
The dominant cost of one paint_fill_with call (~50% wall time at small radii) is computing the chamfer-3-4 SDF and Zhang-Suen skeleton, both pure functions of the hull. The optimizer calls paint_fill_with thousands of times per hull while only varying brush/walker params, so this work is fully redundant across calls. New: a process-global Mutex<HashMap<HullKey, Arc<HullData>>> cache keyed by FNV-1a fingerprint of the hull's pixel coordinates. First call computes; subsequent calls hand back an Arc<HullData> in O(N) fingerprint hash + O(1) lookup. Grid now holds Arc<HullData> for the immutable hull-derived state (was_ink, sdf, skel_endpoints) and clones only the mutable `unpainted` mask per call. Bbox + skeleton_length are duplicated into Grid so the disk-iteration hot path doesn't pay an Arc deref. Bit-exact w.r.t. the alphabet report. Expected speedup is largest in the optimizer's tight loop (many calls per hull); the alphabet report only paints each hull once so most of its wins were from the prior precompute/bitset commits.
This commit is contained in:
@@ -12,7 +12,8 @@
|
||||
// over direction has unpainted ink ahead, while alternate directions
|
||||
// don't.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use rayon::prelude::*;
|
||||
use crate::fill::{FillResult, rdp_simplify_f32, chamfer_distance,
|
||||
zhang_suen_thin, prune_skeleton_spurs, zs_neighbors};
|
||||
@@ -290,7 +291,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
|
||||
-> (u32, u32, u32)
|
||||
{
|
||||
if strokes.is_empty() { return (0, 0, 0); }
|
||||
let mut count = vec![0u32; grid.was_ink.len()];
|
||||
let mut count = vec![0u32; grid.hull.was_ink.len()];
|
||||
let r2 = grid.brush_radius_sq;
|
||||
for stroke in strokes {
|
||||
for win in stroke.windows(2) {
|
||||
@@ -322,7 +323,7 @@ fn measure_sweep_full(strokes: &[Vec<(f32, f32)>], grid: &Grid)
|
||||
for (i, &c) in count.iter().enumerate() {
|
||||
if c == 0 { continue; }
|
||||
total += 1;
|
||||
if !grid.was_ink.get(i) { bg += 1; }
|
||||
if !grid.hull.was_ink.get(i) { bg += 1; }
|
||||
else { repaint += c - 1; }
|
||||
}
|
||||
(bg, total, repaint)
|
||||
@@ -434,6 +435,7 @@ fn encode_coverage_b64(grid: &Grid) -> String {
|
||||
/// fits L1 nicely, and word-at-a-time popcount is available when
|
||||
/// scanning whole grids. All ops are `#[inline]` since they're called
|
||||
/// from the disk-iteration hot path.
|
||||
#[derive(Clone)]
|
||||
struct BitMask {
|
||||
bits: Vec<u64>,
|
||||
len: usize,
|
||||
@@ -461,26 +463,124 @@ impl BitMask {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Coverage grid: bool per pixel, sized to the hull's bbox ─────────────
|
||||
// ── Hull-derived data: cached per hull pixel fingerprint ────────────────
|
||||
|
||||
struct Grid {
|
||||
/// Pure-function-of-the-hull state: the bbox/grid dimensions, ink mask,
|
||||
/// chamfer SDF, and skeleton-endpoint set. Computing chamfer +
|
||||
/// Zhang-Suen thin + spur-prune is the dominant cost of one
|
||||
/// `paint_fill_with` call (~50% wall time at small radii). The
|
||||
/// optimizer calls `paint_fill_with` thousands of times per hull while
|
||||
/// only varying brush/walker params, so the result is identical every
|
||||
/// time. A small `(hull.id) → Arc<HullData>` cache eliminates the
|
||||
/// recomputation across calls.
|
||||
struct HullData {
|
||||
bx: i32, by: i32,
|
||||
width: i32, height: i32,
|
||||
/// `true` = ink pixel that hasn't been painted yet.
|
||||
unpainted: BitMask,
|
||||
/// `true` = pixel was ink in the original glyph (immutable; never
|
||||
/// changes after construction). Lets relaxation tell "ink" apart from
|
||||
/// "background" without conflating it with painted state.
|
||||
was_ink: BitMask,
|
||||
/// Chamfer 3-4 distance / 3 (≈ Euclidean px from boundary). Used to
|
||||
/// snap raw start points up the gradient onto the medial-axis ridge,
|
||||
/// so strokes begin at stroke-centerline rather than polygon-edge.
|
||||
sdf: Vec<f32>,
|
||||
/// Skeleton-endpoint pixel positions (degree-1 nodes of the thinned
|
||||
/// glyph after spur pruning). These are the "legs" — the natural
|
||||
/// pen-down anchors for a human writing the letter. A closed shape
|
||||
/// (O, 0, etc.) has zero endpoints.
|
||||
skel_endpoints: Vec<(i32, i32)>,
|
||||
skeleton_length: u32,
|
||||
ink_total: i32,
|
||||
}
|
||||
|
||||
/// Cache key. `hull.id` alone isn't enough — extract_hulls assigns
/// IDs from a per-call counter, so distinct hulls from different
/// rasterizations collide on id. Mirror-image letters (p/q at the
/// same scale) can also share area + bounds. We use a 64-bit FNV-1a
/// style fingerprint of the pixel coordinate stream as the key. It
/// costs O(N) on every lookup (hit or miss). It is a hash, not a proof
/// of identity: a collision between two different hulls is
/// astronomically unlikely but not impossible, and would silently
/// hand back the other hull's data.
|
||||
type HullKey = u64;
|
||||
|
||||
/// Fingerprints a hull by folding its pixel coordinates, in stored order,
/// into a 64-bit FNV-1a style hash.
///
/// Each coordinate is mixed in as one whole word (xor, then multiply by
/// the FNV prime) rather than byte-by-byte, with `x` before `y` for every
/// pixel. An empty pixel list yields the FNV offset basis. The result
/// depends on the order of `hull.pixels`.
fn hull_key(hull: &Hull) -> HullKey {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    hull.pixels
        .iter()
        .flat_map(|&(x, y)| [x, y])
        .fold(FNV_OFFSET_BASIS, |acc, coord| {
            (acc ^ coord as u64).wrapping_mul(FNV_PRIME)
        })
}
|
||||
|
||||
fn hull_cache() -> &'static Mutex<HashMap<HullKey, Arc<HullData>>> {
|
||||
static CACHE: OnceLock<Mutex<HashMap<HullKey, Arc<HullData>>>> = OnceLock::new();
|
||||
CACHE.get_or_init(|| Mutex::new(HashMap::new()))
|
||||
}
|
||||
|
||||
fn get_or_compute_hull_data(hull: &Hull) -> Arc<HullData> {
|
||||
let key = hull_key(hull);
|
||||
{
|
||||
let cache = hull_cache().lock().unwrap();
|
||||
if let Some(c) = cache.get(&key) {
|
||||
return c.clone();
|
||||
}
|
||||
}
|
||||
// Compute outside the lock so concurrent callers for different
|
||||
// hulls don't serialize. A small race is possible (two threads
|
||||
// miss for the same hull and both compute) — both produce
|
||||
// identical data, so the loser's copy is just dropped.
|
||||
let computed = Arc::new(compute_hull_data(hull));
|
||||
let mut cache = hull_cache().lock().unwrap();
|
||||
cache.entry(key).or_insert_with(|| computed.clone()).clone()
|
||||
}
|
||||
|
||||
fn compute_hull_data(hull: &Hull) -> HullData {
|
||||
// Pad the grid past the hull's AABB so that bg pixels swept by a
|
||||
// brush that overhangs the polygon (e.g. at the top of an `I`,
|
||||
// or the corners of a square letter) are counted instead of
|
||||
// silently dropped by the bounds check. PAD must exceed any
|
||||
// brush_radius the optimizer might try.
|
||||
const PAD: i32 = 32;
|
||||
let bx = hull.bounds.x_min as i32 - PAD;
|
||||
let by = hull.bounds.y_min as i32 - PAD;
|
||||
let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
|
||||
let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
|
||||
let cells = (width * height) as usize;
|
||||
let mut was_ink = BitMask::new(cells);
|
||||
let mut sdf = vec![0.0_f32; cells];
|
||||
let mut count = 0;
|
||||
for &(x, y) in &hull.pixels {
|
||||
let lx = x as i32 - bx; let ly = y as i32 - by;
|
||||
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
|
||||
let idx = (ly * width + lx) as usize;
|
||||
was_ink.set(idx);
|
||||
count += 1;
|
||||
}
|
||||
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
|
||||
let dist = chamfer_distance(hull, &pixel_set);
|
||||
for (&(x, y), &d) in dist.iter() {
|
||||
let lx = x as i32 - bx; let ly = y as i32 - by;
|
||||
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
|
||||
sdf[(ly * width + lx) as usize] = d;
|
||||
}
|
||||
let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
|
||||
let mut skel = zhang_suen_thin(&hull.pixels);
|
||||
let spur_len = (sdf_max * 1.5).round() as usize;
|
||||
prune_skeleton_spurs(&mut skel, spur_len.max(2));
|
||||
let skel_endpoints: Vec<(i32, i32)> = skel.iter()
|
||||
.filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
|
||||
.map(|&(x, y)| (x as i32, y as i32))
|
||||
.collect();
|
||||
let skeleton_length = skel.len() as u32;
|
||||
HullData { bx, by, width, height, was_ink, sdf, skel_endpoints,
|
||||
skeleton_length, ink_total: count }
|
||||
}
|
||||
|
||||
// ── Coverage grid: per-call mutable state, sized to the hull's bbox ─────
|
||||
|
||||
struct Grid {
|
||||
// Bbox is duplicated from `hull` so the disk-iteration hot path
|
||||
// doesn't pay an Arc deref on every step.
|
||||
bx: i32, by: i32,
|
||||
width: i32, height: i32,
|
||||
/// Cached hull-derived state: was_ink mask, SDF, skeleton
|
||||
/// endpoints. Shared across all `paint_fill_with` calls on the
|
||||
/// same hull via Arc — avoids recomputing chamfer + skeleton
|
||||
/// per call. Read-only from this struct's perspective.
|
||||
hull: Arc<HullData>,
|
||||
/// `true` = ink pixel that hasn't been painted yet. Owned, mutable.
|
||||
/// Initialized as a clone of `hull.was_ink`.
|
||||
unpainted: BitMask,
|
||||
/// Approximate medial-axis length, in pixels. Counted as skeleton
|
||||
/// pixel count (each connected skeleton pixel contributes ~1 px to
|
||||
/// the centerline length). Used as the "ideal" path length — a
|
||||
@@ -507,58 +607,23 @@ struct Grid {
|
||||
|
||||
impl Grid {
|
||||
fn from_hull(hull: &Hull) -> Self {
|
||||
// Pad the grid past the hull's AABB so that bg pixels swept by a
|
||||
// brush that overhangs the polygon (e.g. at the top of an `I`,
|
||||
// or the corners of a square letter) are counted instead of
|
||||
// silently dropped by the bounds check. PAD must exceed any
|
||||
// brush_radius the optimizer might try.
|
||||
const PAD: i32 = 32;
|
||||
let bx = hull.bounds.x_min as i32 - PAD;
|
||||
let by = hull.bounds.y_min as i32 - PAD;
|
||||
let width = (hull.bounds.x_max as i32 - hull.bounds.x_min as i32 + 1 + 2 * PAD).max(1);
|
||||
let height = (hull.bounds.y_max as i32 - hull.bounds.y_min as i32 + 1 + 2 * PAD).max(1);
|
||||
let cells = (width * height) as usize;
|
||||
let mut unpainted = BitMask::new(cells);
|
||||
let mut was_ink = BitMask::new(cells);
|
||||
let mut sdf = vec![0.0_f32; cells];
|
||||
let mut count = 0;
|
||||
for &(x, y) in &hull.pixels {
|
||||
let lx = x as i32 - bx; let ly = y as i32 - by;
|
||||
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
|
||||
let idx = (ly * width + lx) as usize;
|
||||
unpainted.set(idx);
|
||||
was_ink.set(idx);
|
||||
count += 1;
|
||||
let h = get_or_compute_hull_data(hull);
|
||||
let unpainted = h.was_ink.clone();
|
||||
let ink_total = h.ink_total;
|
||||
let bx = h.bx; let by = h.by;
|
||||
let width = h.width; let height = h.height;
|
||||
let skeleton_length = h.skeleton_length;
|
||||
Self {
|
||||
bx, by, width, height,
|
||||
hull: h,
|
||||
unpainted,
|
||||
skeleton_length,
|
||||
ink_total,
|
||||
ink_remaining: ink_total,
|
||||
brush_radius: 0.0,
|
||||
brush_radius_sq: 0.0,
|
||||
disk_offsets: Vec::new(),
|
||||
}
|
||||
// Chamfer distance (per-pixel, in approximate Euclidean units)
|
||||
// for medial-axis snapping in pick_start.
|
||||
let pixel_set: HashSet<(u32, u32)> = hull.pixels.iter().copied().collect();
|
||||
let dist = chamfer_distance(hull, &pixel_set);
|
||||
for (&(x, y), &d) in dist.iter() {
|
||||
let lx = x as i32 - bx; let ly = y as i32 - by;
|
||||
if lx < 0 || ly < 0 || lx >= width || ly >= height { continue; }
|
||||
sdf[(ly * width + lx) as usize] = d;
|
||||
}
|
||||
// Skeleton + endpoint detection. Spur prune length is keyed to the
|
||||
// glyph's max stroke half-width so tiny notches at thick strokes
|
||||
// don't masquerade as legs. (zhang_suen + spur_prune is the same
|
||||
// recipe the topo-stroke and skeleton-fill paths use.)
|
||||
let sdf_max = dist.values().copied().fold(0.0_f32, f32::max).max(0.5);
|
||||
let mut skel = zhang_suen_thin(&hull.pixels);
|
||||
let spur_len = (sdf_max * 1.5).round() as usize;
|
||||
prune_skeleton_spurs(&mut skel, spur_len.max(2));
|
||||
let skel_endpoints: Vec<(i32, i32)> = skel.iter()
|
||||
.filter(|&&p| zs_neighbors(p.0, p.1).iter().filter(|n| skel.contains(n)).count() == 1)
|
||||
.map(|&(x, y)| (x as i32, y as i32))
|
||||
.collect();
|
||||
// Skeleton length ≈ skeleton pixel count. For an 8-connected
|
||||
// skeleton this slightly under-counts diagonal segments (sqrt(2)
|
||||
// each), but it's close enough for a path-budget heuristic.
|
||||
let skeleton_length = skel.len() as u32;
|
||||
|
||||
Self { bx, by, width, height, unpainted, was_ink, sdf, skel_endpoints,
|
||||
skeleton_length, ink_total: count, ink_remaining: count,
|
||||
brush_radius: 0.0, brush_radius_sq: 0.0, disk_offsets: Vec::new() }
|
||||
}
|
||||
|
||||
/// Configure the disk shape used for evaluate_disk / paint_disk /
|
||||
@@ -591,7 +656,7 @@ impl Grid {
|
||||
fn sdf_at(&self, x: i32, y: i32) -> f32 {
|
||||
let lx = x - self.bx; let ly = y - self.by;
|
||||
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return 0.0; }
|
||||
self.sdf[(ly * self.width + lx) as usize]
|
||||
self.hull.sdf[(ly * self.width + lx) as usize]
|
||||
}
|
||||
|
||||
/// Snap a raw pixel position onto the medial-axis ridge by greedy
|
||||
@@ -665,7 +730,7 @@ impl Grid {
|
||||
fn is_ink(&self, x: i32, y: i32) -> bool {
|
||||
let lx = x - self.bx; let ly = y - self.by;
|
||||
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { return false; }
|
||||
self.was_ink.get((ly * self.width + lx) as usize)
|
||||
self.hull.was_ink.get((ly * self.width + lx) as usize)
|
||||
}
|
||||
|
||||
/// Returns (new_ink, repaint_ink, bg) — pixel counts under disk(p, r):
|
||||
@@ -695,7 +760,7 @@ impl Grid {
|
||||
let idx = (ly * self.width + lx) as usize;
|
||||
if self.unpainted.get(idx) {
|
||||
new_ink += 1;
|
||||
} else if self.was_ink.get(idx) {
|
||||
} else if self.hull.was_ink.get(idx) {
|
||||
repaint_ink += 1;
|
||||
} else {
|
||||
bg += 1;
|
||||
@@ -816,7 +881,7 @@ impl Grid {
|
||||
let (pixels, _) = &components[chosen];
|
||||
let comp_set: HashSet<usize> = pixels.iter().copied().collect();
|
||||
let mut best_endpoint: Option<(i32, i32)> = None;
|
||||
for &(ex, ey) in &self.skel_endpoints {
|
||||
for &(ex, ey) in self.hull.skel_endpoints.iter() {
|
||||
let lx = ex - self.bx; let ly = ey - self.by;
|
||||
if lx < 0 || ly < 0 || lx >= self.width || ly >= self.height { continue; }
|
||||
let idx = (ly * self.width + lx) as usize;
|
||||
|
||||
Reference in New Issue
Block a user