From 9190661d7a8eb2ad47886bcd293aeaa9dcc33d77 Mon Sep 17 00:00:00 2001 From: Mitchell Hansen Date: Sat, 2 May 2026 00:27:50 -0700 Subject: [PATCH] brush-paint-opt: distributed meta-optimizer worker + orchestrator paint_meta_opt_worker takes one outer-idx, builds the ScoreWeights for that index, runs the full inner optimizer under those weights, and prints a MetaResult JSON (matching the in-process struct). scripts/meta_optimize_distributed.sh splits N outer samples between local and a remote SSH host, runs them serially on each host (each already saturates rayon internally), and lex-sorts the merged JSON by the same ordering compare_reports uses. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 4 + scripts/meta_optimize_distributed.sh | 137 +++++++++++++++++++++++++++ src/bin/paint_meta_opt_worker.rs | 98 +++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100755 scripts/meta_optimize_distributed.sh create mode 100644 src/bin/paint_meta_opt_worker.rs diff --git a/Cargo.toml b/Cargo.toml index c913abd7..7c9c8355 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,3 +48,7 @@ path = "src/pipeline_bench.rs" [[bin]] name = "paint_opt_worker" path = "src/bin/paint_opt_worker.rs" + +[[bin]] +name = "paint_meta_opt_worker" +path = "src/bin/paint_meta_opt_worker.rs" diff --git a/scripts/meta_optimize_distributed.sh b/scripts/meta_optimize_distributed.sh new file mode 100755 index 00000000..1e149ead --- /dev/null +++ b/scripts/meta_optimize_distributed.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# Distributed meta-optimizer. +# +# Splits N outer ScoreWeights samples between THIS machine and a remote +# SSH host, runs each as `paint_meta_opt_worker N --inner I --passes P`, +# collects all `MetaResult`s, lexicographically sorts (matches the +# in-process `compare_reports`), and prints the best result + a top-5 +# table. 
+#
+# Each outer sample = build a random ScoreWeights candidate at index N
+# → run the FULL inner optimizer under those weights → score the result.
+# Outer samples are independent → embarrassingly parallel across hosts.
+#
+# Usage:
+#   scripts/meta_optimize_distributed.sh [N] [I] [P] [REMOTE]
+#     N      = total outer samples (default 16)
+#     I      = inner starts per outer (default 16)
+#     P      = inner passes (default 4)
+#     REMOTE = "user@host:/path/to/repo" (default $REMOTE_TRAC3R)
+#
+# Splits via LOCAL_FRAC env var (default 0.37 = 37 % local, 63 % remote).
+# Tune for unequal cores; with 14-core local + 24-core remote, 0.37 is
+# proportional.
+
+set -euo pipefail
+
+N="${1:-16}"
+I="${2:-16}"
+P="${3:-4}"
+REMOTE="${4:-${REMOTE_TRAC3R:-}}"
+LOCAL_FRAC="${LOCAL_FRAC:-0.37}"
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+TMPDIR="$(mktemp -d -t meta-distrib.XXXXXX)"  # NOTE(review): shadows the conventional TMPDIR env var; if the caller exported TMPDIR, children inherit a dir we delete on EXIT — consider a private name
+trap 'rm -rf "$TMPDIR"' EXIT
+
+echo "[orch] meta-opt: $N outer × $I inner × $P passes" >&2
+echo "[orch] root=$ROOT remote=$REMOTE" >&2
+
+# Round-to-nearest split of the N outer samples between the two hosts.
+LOCAL_N=$(awk -v n="$N" -v f="$LOCAL_FRAC" 'BEGIN{ printf "%d", int(n*f + 0.5) }')
+REMOTE_N=$((N - LOCAL_N))
+[[ -z "$REMOTE" ]] && { LOCAL_N="$N"; REMOTE_N=0; }
+echo "[orch] split: $LOCAL_N local, $REMOTE_N remote" >&2
+
+# Build local first.
+echo "[orch] cargo build --release --bin paint_meta_opt_worker (local)…" >&2
+( cd "$ROOT" && cargo build --release --bin paint_meta_opt_worker ) >&2
+
+# Build remote in parallel (login shell so cargo is on PATH).
+REMOTE_BUILD_PID=""
+if [[ -n "$REMOTE" ]]; then
+  HOST="${REMOTE%%:*}"
+  RPATH="${REMOTE#*:}"
+  echo "[orch] cargo build --release on remote ($HOST:$RPATH)…" >&2
+  ( ssh "$HOST" "bash -lc 'cd \"$RPATH\" && cargo build --release --bin paint_meta_opt_worker'" >&2 ) &
+  REMOTE_BUILD_PID=$!
+fi
+
+# Concurrency policy: each meta-worker already saturates rayon
+# internally (par_iter inner starts × par_iter corpus).  Running TWO
+# in parallel on the same box doubles thread pressure with no gain
+# and probably some loss.  So: at most one local + one remote at a time.
+#
+# Samples therefore run serially on each host (plain loops below).  The
+# two HOSTS run in parallel because we kick them off as separate
+# background pipelines.
+
+LOCAL_OUT="$TMPDIR/local.json"
+: > "$LOCAL_OUT"
+LOCAL_PID=""
+if (( LOCAL_N > 0 )); then
+  echo "[orch] dispatching $LOCAL_N samples to local (serial)…" >&2
+  (
+    for i in $(seq 0 $((LOCAL_N - 1))); do
+      "$ROOT/target/release/paint_meta_opt_worker" "$i" --inner "$I" --passes "$P"
+    done
+  ) >> "$LOCAL_OUT" &
+  LOCAL_PID=$!
+fi
+
+# Always reap the background remote build — even when the split assigns
+# REMOTE_N=0 — so we never leave an orphaned ssh/cargo job behind.
+[[ -n "$REMOTE_BUILD_PID" ]] && wait "$REMOTE_BUILD_PID"
+
+REMOTE_OUT="$TMPDIR/remote.json"
+: > "$REMOTE_OUT"
+REMOTE_PID=""
+if [[ -n "$REMOTE" && "$REMOTE_N" -gt 0 ]]; then
+  HOST="${REMOTE%%:*}"
+  RPATH="${REMOTE#*:}"
+  echo "[orch] dispatching $REMOTE_N samples to $HOST (serial on remote)…" >&2
+  (
+    seq "$LOCAL_N" $((N - 1)) | \
+    ssh "$HOST" "bash -lc 'cd \"$RPATH\" && while read -r i; do ./target/release/paint_meta_opt_worker \$i --inner $I --passes $P; done'"
+  ) >> "$REMOTE_OUT" &
+  REMOTE_PID=$!
+fi
+
+# Wait for both dispatch pipelines; a failing worker propagates its exit
+# status through `wait` and (via set -e) aborts the orchestrator.
+[[ -n "$LOCAL_PID" ]] && wait "$LOCAL_PID"
+[[ -n "$REMOTE_PID" ]] && wait "$REMOTE_PID"
+
+ALL="$TMPDIR/all.json"
+cat "$LOCAL_OUT" "$REMOTE_OUT" > "$ALL"
+LINES=$(wc -l < "$ALL" | tr -d ' ')
+echo "[orch] collected $LINES results" >&2
+if [[ "$LINES" -ne "$N" ]]; then
+  echo "[orch] WARNING: expected $N, got $LINES" >&2
+fi
+
+# Lex-sort matching CorpusReport's compare_reports:
+#   tier-1: fail_coverage, fail_bg, fail_single_stroke, fail_two_stroke, fail_length_budget
+#   tier-2: total_bg, total_strokes, total_unpainted_density, total_repaint, total_length
+# Sort ONCE into a file and derive both the top-5 table and the "best"
+# output from it, so the key list cannot silently drift between the two
+# jq programs.
+SORTED="$TMPDIR/sorted.json"
+jq -s 'sort_by([
+  .report.fail_coverage,
+  .report.fail_bg,
+  .report.fail_single_stroke,
+  .report.fail_two_stroke,
+  .report.fail_length_budget,
+  .report.total_bg,
+  .report.total_strokes,
+  .report.total_unpainted_density,
+  .report.total_repaint,
+  .report.total_length
+])' "$ALL" > "$SORTED"
+
+echo "" >&2
+echo "[orch] top 5 by lex order:" >&2
+jq '.[0:5] | .[] | "idx=\(.idx) T1[cov=\(.report.fail_coverage) bg=\(.report.fail_bg) 1stk=\(.report.fail_single_stroke) 2stk=\(.report.fail_two_stroke) len=\(.report.fail_length_budget)] T2[bg=\(.report.total_bg) stk=\(.report.total_strokes) dens=\(.report.total_unpainted_density|round) rep=\(.report.total_repaint) len=\(.report.total_length|round)]"' -r "$SORTED" >&2
+
+echo "" >&2
+echo "[orch] best (full JSON on stdout):" >&2
+jq '.[0]' "$SORTED"
diff --git a/src/bin/paint_meta_opt_worker.rs b/src/bin/paint_meta_opt_worker.rs
new file mode 100644
index 00000000..b3314f62
--- /dev/null
+++ b/src/bin/paint_meta_opt_worker.rs
@@ -0,0 +1,98 @@
+//! Meta-optimizer worker — runs ONE outer sample of the meta search.
+//! Builds the ScoreWeights for index N, runs the full inner optimizer
+//! 
under those weights, evaluates the result against the corpus, and
+//! prints a JSON `MetaResult` to stdout.  Lets the meta-search be
+//! sharded across SSH-reachable machines: each box runs
+//! `paint_meta_opt_worker N` for its assigned indices in parallel,
+//! orchestrator collects + lex-sorts.
+//!
+//! Usage:
+//!   paint_meta_opt_worker <outer_idx> [--inner N] [--passes K]
+//!
+//! Output (stdout): `{ "idx", "weights", "params", "report" }` (JSON).
+//! Stderr: human-readable progress; never parse it.
+
+use std::env;
+use std::process::ExitCode;
+use trac3r_lib::brush_paint::PaintParams;
+use trac3r_lib::brush_paint_opt::{
+    build_meta_weights, build_corpus, default_axes,
+    evaluate_score_weights, MetaResult,
+};
+
+/// Parse `<outer_idx> [--inner N] [--passes K]` from argv.
+/// Returns (outer_idx, inner_starts, passes); Err carries a usage string.
+fn parse_args() -> Result<(usize, usize, u32), String> {
+    let argv: Vec<String> = env::args().collect();
+    if argv.len() < 2 {
+        return Err(format!(
+            "usage: {} <outer_idx> [--inner N] [--passes K]\n\
+             outer_idx is the meta-optimizer's outer sample index (0-based).\n\
+             inner defaults to 16, passes to 4.",
+            argv.first().cloned().unwrap_or_else(|| "paint_meta_opt_worker".to_string())
+        ));
+    }
+    let outer_idx: usize = argv[1].parse()
+        .map_err(|e| format!("outer_idx must be a non-negative integer: {e}"))?;
+    let mut inner: usize = 16;
+    let mut passes: u32 = 4;
+    let mut i = 2;
+    while i < argv.len() {
+        match argv[i].as_str() {
+            "--inner" => {
+                i += 1;
+                inner = argv.get(i).ok_or("--inner requires a value")?
+                    .parse().map_err(|e| format!("--inner value invalid: {e}"))?;
+            }
+            "--passes" => {
+                i += 1;
+                passes = argv.get(i).ok_or("--passes requires a value")?
+                    .parse().map_err(|e| format!("--passes value invalid: {e}"))?;
+            }
+            other => return Err(format!("unknown arg: {other}")),
+        }
+        i += 1;
+    }
+    Ok((outer_idx, inner, passes))
+}
+
+fn main() -> ExitCode {
+    // Exit 2 on bad CLI args, 3 on serialization failure (see below).
+    let (outer_idx, n_inner, n_passes) = match parse_args() {
+        Ok(t) => t,
+        Err(e) => { eprintln!("{e}"); return ExitCode::from(2); }
+    };
+
+    let host = hostname();
+    let cores = std::thread::available_parallelism().map(|n| n.get()).unwrap_or(0);
+    eprintln!("[meta-worker {host}/{cores}t] outer_idx={outer_idx} inner={n_inner} passes={n_passes}");
+
+    let t0 = std::time::Instant::now();
+    let weights = build_meta_weights(outer_idx);
+    let corpus = build_corpus();
+    let axes = default_axes();
+    let base = PaintParams::default();
+    let (params, report) = evaluate_score_weights(
+        &weights, &corpus, &axes, &base, n_inner, n_passes
+    );
+    let elapsed = t0.elapsed();
+    eprintln!(
+        "[meta-worker {host}] done idx={} elapsed={:.1}s {}",
+        outer_idx, elapsed.as_secs_f64(), report.summary()
+    );
+
+    // The one machine-readable line: the orchestrator slurps stdout as JSONL.
+    let result = MetaResult { idx: outer_idx, weights, params, report };
+    match serde_json::to_string(&result) {
+        Ok(json) => { println!("{json}"); ExitCode::SUCCESS }
+        Err(e) => { eprintln!("[meta-worker {host}] JSON serialize failed: {e}"); ExitCode::from(3) }
+    }
+}
+
+/// Best-effort host label for log lines: env vars first, then the
+/// `hostname` binary, then "?".  Never fails.
+fn hostname() -> String {
+    std::env::var("HOSTNAME")
+        .or_else(|_| std::env::var("HOST"))
+        .unwrap_or_else(|_| {
+            std::process::Command::new("hostname")
+                .output().ok()
+                .and_then(|o| String::from_utf8(o.stdout).ok())
+                .map(|s| s.trim().to_string())
+                .unwrap_or_else(|| "?".to_string())
+        })
+}