feat(unwrapped): bake explorable per-sample feature dataset (#81)

Joins per-file features × resolver family/kind × standardized PCA projection (PC1-5) × KMeans timbral cluster × audition .wav path into one unwrapped.json the browser explores without recomputing. Reuses feature_eda.load_matrix so the matrix matches the MDA exactly. Names the 5 leading PCs as superfeature axes (Brightness / Timbre / Loudness / Envelope kick-bass / Tonal-noisy), orienting coords by a stable anchor so poles read right. Contract tests green.

feat(unwrapped): bake explorable per-sample feature dataset (#81)
Joins per-file features × resolver family/kind × standardized PCA projection (PC1-5) × KMeans timbral cluster × audition .wav path into one unwrapped.json the browser explores without recomputing. Reuses feature_eda.load_matrix so the matrix matches the MDA exactly. Names the 5 leading PCs as superfeature axes (Brightness / Timbre / Loudness / Envelope kick-bass / Tonal-noisy), orienting coords by a stable anchor so poles read right. Contract tests green.
c2d6eedb · PLN (Algolia) · 9134398c · c2d6eedb · c2d6eedb · c2d6eedb
Commit c2d6eedb authored Jun 07, 2026 by PLN (Algolia)
4 changed files
--- a/armada/tide-table/.gitignore
+++ b/armada/tide-table/.gitignore
 dist/
 __pycache__/
+# machine-local audition symlink → Dirt-Samples (recreated by build_unwrapped.py)
+_samples
--- a/armada/tide-table/build_unwrapped.py
+++ b/armada/tide-table/build_unwrapped.py
+#!/usr/bin/env python3
+"""build_unwrapped — bake the explorable 'ParVagues Unwrapped' viz dataset.
+
+The MDA (feature_eda) found the story; this lays it out as ONE per-sample table the
+browser can explore without recomputing anything. We reuse feature_eda.load_matrix so
+the rows, columns and median-imputation match the analysis EXACTLY (no second source of
+truth for the matrix), then add the three things a scatter needs that the JSON findings
+don't carry per file: the PCA projection (PC1..5 coords), the KMeans timbral-cluster id,
+and an audition path to the actual .wav.
+
+Per-file row = {folder, name, family, kind, agrees, cluster, pc[5], feat{…}, wav}
+(wav is omitted when no matching audio file is found for the row).
+Plus: the 5 named SUPERFEATURE axes (PC name + poles + loadings), a shortlist of raw
+axes (centroid/attack/…), the fleet family palette, the family×cluster contingency
+(the 'folders are loose' grid), RF importances, and headline MDA numbers.
+
+A folder is a LOOSE grouping ([[feedback_sample_grouping_loose]]): we color by per-file
+family by default and offer audio-cluster / kind / folder-agreement as alternate lenses
+so the mismatch between timbre and label is explorable, not hidden.
+
+    python3 build_unwrapped.py            # → unwrapped.json (+ _samples symlink)
+"""
+import json
+import sys
+from collections import Counter, defaultdict
+from datetime import date
+from pathlib import Path
+
+import numpy as np
+
+import feature_eda as FE
+import models as M
+import sample_meta as META
+
+# Filesystem anchors. Everything is resolved from this script's own directory, so the
+# build behaves the same regardless of the current working directory.
+HERE = Path(__file__).resolve().parent
+# Design tokens of the sibling ui package; main() reads tok["sample"] for the palette.
+TOKENS = HERE.parent / "ui" / "src" / "tokens.json"
+# Per-folder kind / dominant / folder_agrees, read under the "families" key in main().
+FAMILIES = HERE / "sample_families.json"
+# The baked dataset written by main().
+OUT = HERE / "unwrapped.json"
+SAMPLES_LINK = HERE / "_samples"          # relative symlink → Dirt-Samples (gitignored)
+
+# Raw features worth exposing as direct axes (music-aficionado language, not just PCs).
+# Each entry is (feature key, axis label, description, unit); main() unpacks them in
+# that order and silently drops any entry whose key is not a column of the matrix.
+RAW_AXES = [
+    ("spectral_centroid", "Brightness", "spectral centroid (Hz) — dull → bright", "Hz"),
+    ("temporal_centroid", "Attack ↔ Sustain", "energy-time centroid (s) — punchy → sustained", "s"),
+    ("rms_db", "Loudness", "RMS level (dBFS)", "dB"),
+    ("decay_slope_db_s", "Decay speed", "post-attack decay (dB/s) — fast → slow", "dB/s"),
+    ("spectral_flatness", "Tonal ↔ Noisy", "spectral flatness — pitched → noisy", ""),
+    ("pct_percussive", "Percussiveness", "HPSS percussive fraction", ""),
+    ("duration_s", "Length", "sample length (s)", "s"),
+    ("log_attack_time", "Attack time", "log attack time (log s)", "log s"),
+    ("zcr", "Zero-crossings", "zero-crossing rate — low → high/noisy", ""),
+    ("hnr", "Harmonicity", "harmonic-to-noise ratio (dB)", "dB"),
+    ("f0_median", "Pitch", "median fundamental (Hz), where pitched", "Hz"),
+]
+# Features we ship raw per-file (axis pool + tooltip), beyond the PC coords.
+# Derived from RAW_AXES so the per-sample "feat" dict and the axis list cannot drift.
+RAW_KEEP = [a[0] for a in RAW_AXES]
+
+# Heuristic names for the leading PCs, chosen from which features dominate the loading.
+# Oriented so the dominant feature loads positive → stable sign across re-runs.
+# Each rule is (candidate feature keys, axis name, low-pole label, high-pole label).
+# _name_pc tries the rules in this order and takes the first one with at least two of
+# its keys among the PC's six strongest loadings, so earlier rules win ties.
+PC_NAMING = [
+    (["spectral_centroid", "spectral_rolloff", "spectral_spread", "spectral_bandwidth"],
+     "Brightness", "dark / sub", "bright / airy"),
+    (["mfcc_5", "mfcc_6", "mfcc_7", "mfcc_8", "mfcc_4"],
+     "Timbre colour", "hollow", "full / vowel-like"),
+    (["rms_db", "peak_db", "spectral_flux", "mfcc_0"],
+     "Loudness / energy", "quiet", "loud / dense"),
+    (["temporal_centroid", "decay_slope_db_s", "log_attack_time", "duration_s", "sustain_ratio"],
+     "Envelope (kick ↔ bass)", "punchy / transient", "sustained / held"),
+    (["spectral_kurtosis", "chroma_entropy", "pct_percussive", "spectral_flatness", "hnr"],
+     "Tonal ↔ noisy", "pitched / clean", "noisy / textured"),
+]
+
+
+def _name_pc(loadings):
+    """Pick a human name for a PC from its top |loading| features; orient sign + poles."""
+    top = sorted(loadings, key=lambda lw: -abs(lw[1]))
+    topset = {f for f, _ in top[:6]}
+    for keys, name, lo, hi in PC_NAMING:
+        if sum(f in topset for f in keys) >= 2:
+            # orient so the first matching key loads positive (stable sign across
+            # re-runs). When it loads negative we negate the COORDS (sign, in main),
+            # so the canonical lo/hi labels stay attached to the right pole — the
+            # labels never swap, the numbers do.
+            anchor = next((w for f, w in top if f in keys), top[0][1])
+            flip = anchor < 0
+            return name, lo, hi, flip
+    return "PC", "low", "high", False
+
+
+def main():
+    X, names, rows = FE.load_matrix()
+    Z = FE._standardize(X)
+
+    # ── PCA projection (same matrix as the MDA) ────────────────────────────────
+    from sklearn.decomposition import PCA
+    from sklearn.cluster import KMeans
+    p = PCA().fit(Z)
+    proj = p.transform(Z)                              # n_files × n_components
+    ev = p.explained_variance_ratio_
+    cum = np.cumsum(ev)
+    n_pc = 5
+
+    # ── KMeans timbral clusters (k = #families, like the MDA) ──────────────────
+    fams_present = sorted({r["family"] for r in rows if r["family"]})
+    k = max(2, len(fams_present))
+    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(Z)
+    clusters = km.labels_
+
+    # ── fleet family palette (color + glyph) ───────────────────────────────────
+    tok = json.loads(TOKENS.read_text())
+    fam_pal = {f["key"]: {"label": f["label"], "color": f["base"], "glyph": f["glyph"]}
+               for f in tok["sample"]}
+
+    # ── per-folder kind / agreement + real .wav filenames ──────────────────────
+    famdoc = json.loads(FAMILIES.read_text())["families"]
+    folder_meta = {name: {"kind": v["kind"], "dominant": v["dominant"],
+                          "agrees": v["folder_agrees"]}
+                   for name, v in famdoc.items()}
+    wav_by = {}                                        # (folder, stem) → filename
+    for folder in {r["folder"] for r in rows}:
+        for f in META.folder_files(folder):
+            wav_by[(folder, f.stem)] = f.name
+
+    # ── assemble per-file rows ─────────────────────────────────────────────────
+    fidx = {n: i for i, n in enumerate(names)}
+    samples, ncluster = [], Counter()
+    contingency = defaultdict(Counter)                 # family → cluster → count
+    for i, r in enumerate(rows):
+        fam = r["family"]
+        fm = folder_meta.get(r["folder"], {})
+        wav = wav_by.get((r["folder"], r["file"]))
+        rec = {
+            "folder": r["folder"], "name": r["file"], "family": fam,
+            "kind": fm.get("kind"), "agrees": fm.get("agrees"),
+            "cluster": int(clusters[i]),
+            "pc": [round(float(proj[i, j]), 3) for j in range(n_pc)],
+            "feat": {kk: round(float(X[i, fidx[kk]]), 4)
+                     for kk in RAW_KEEP if kk in fidx},
+        }
+        if wav:
+            rec["wav"] = f"_samples/{r['folder']}/{wav}"
+        samples.append(rec)
+        ncluster[int(clusters[i])] += 1
+        if fam:
+            contingency[fam][int(clusters[i])] += 1
+
+    # ── named superfeature axes (PC pole language + loadings) ──────────────────
+    pc_axes = []
+    for j in range(n_pc):
+        comp = list(zip(names, p.components_[j]))
+        name, lo, hi, flip = _name_pc(comp)
+        sign = -1.0 if flip else 1.0
+        top = sorted(comp, key=lambda lw: -abs(lw[1]))[:6]
+        pc_axes.append({
+            "key": f"pc{j}", "label": f"{name}",
+            "explained": round(float(ev[j]), 4),
+            "lo": lo, "hi": hi,
+            "loadings": [{"f": f, "w": round(float(w * sign), 3)} for f, w in top],
+        })
+        # apply the orientation flip to the stored coords too, so hi/lo read right
+        if flip:
+            for s in samples:
+                s["pc"][j] = round(-s["pc"][j], 3)
+
+    # ── family × cluster contingency (the 'folders are loose' grid) ────────────
+    grid = {fam: {str(c): contingency[fam].get(c, 0) for c in range(k)}
+            for fam in fams_present}
+
+    # ── headline MDA numbers (recomputed-consistent + from feature_eda.json) ───
+    eda = json.loads((HERE / "feature_eda.json").read_text()) \
+        if (HERE / "feature_eda.json").exists() else {}
+    headline = {
+        "n_files": len(samples),
+        "n_features": len(names),
+        "n_families": len(fams_present),
+        "intrinsic_dim_90": int(np.searchsorted(cum, 0.90) + 1),
+        "intrinsic_dim_95": int(np.searchsorted(cum, 0.95) + 1),
+        "ari_vs_resolver": eda.get("clustering", {}).get("ari_vs_resolver"),
+        "nmi_vs_resolver": eda.get("clustering", {}).get("nmi_vs_resolver"),
+        "kmeans_k": k,
+        "explained_5pc": round(float(cum[n_pc - 1]), 3),
+    }
+    rf = eda.get("clustering", {}).get("top_family_features", [])
+    correlation = {"n_pairs": eda.get("correlation", {}).get("n_correlated_pairs"),
+                   "n_kept": eda.get("correlation", {}).get("n_kept"),
+                   "pruned": eda.get("correlation", {}).get("pruned", [])}
+
+    prov = M.Provenance(source=M.Source.derived,
+                        locator="build_unwrapped: features × resolver × PCA/KMeans",
+                        as_of=date.today()).model_dump(mode="json")
+
+    payload = {
+        "schema": "ParVagues Unwrapped — explorable per-sample feature space (#81)",
+        "provenance": prov,
+        "headline": headline,
+        "families": fam_pal,
+        "family_order": fams_present,
+        "n_clusters": k,
+        "cluster_sizes": {str(c): ncluster[c] for c in range(k)},
+        "pc_axes": pc_axes,
+        "raw_axes": [{"key": kk, "label": lbl, "desc": desc, "unit": unit}
+                     for kk, lbl, desc, unit in RAW_AXES if kk in fidx],
+        "rf_importance": rf,
+        "correlation": correlation,
+        "contingency": grid,
+        "samples": samples,
+    }
+    OUT.write_text(json.dumps(payload, ensure_ascii=False))
+    sz = OUT.stat().st_size / 1024
+    print(f"✓ {OUT.name}: {len(samples)} samples, {len(names)} feats, "
+          f"{k} clusters, {len(fams_present)} families ({sz:.0f} KB)")
+
+    # ── audition symlink (relative file:// audition; gitignored, machine-local) ─
+    if not SAMPLES_LINK.exists():
+        try:
+            SAMPLES_LINK.symlink_to(META.DIRT)
+            print(f"  + {SAMPLES_LINK.name} → {META.DIRT} (audition)")
+        except OSError as e:
+            print(f"  (no _samples symlink: {e} — audition disabled)", file=sys.stderr)
+    n_wav = sum(1 for s in samples if "wav" in s)
+    print(f"  {n_wav}/{len(samples)} samples auditionable")
+
+
+if __name__ == "__main__":
+    main()
--- a/armada/tide-table/tests/test_build_unwrapped.py
+++ b/armada/tide-table/tests/test_build_unwrapped.py
+"""Contract tests for the Unwrapped viz dataset (build_unwrapped).
+
+The heavy math (matrix, PCA, clustering) is feature_eda's and tested there; here we
+guard the BAKING contract the browser depends on — every sample carries the fields the
+scatter reads, families resolve to the fleet palette, clusters are in range, and the PC
+naming heuristic picks the right human name + sign from a known loading.
+"""
+import json
+from pathlib import Path
+
+import pytest
+
+import build_unwrapped as BU
+
+HERE = Path(__file__).resolve().parent.parent
+OUT = HERE / "unwrapped.json"
+
+
+def test_name_pc_picks_envelope_and_orients_punchy_low():
+    # temporal_centroid + decay dominate, loading NEGATIVE → must flip so the named
+    # 'sustained/held' pole stays the +high end (stable sign across re-runs).
+    loadings = [("temporal_centroid", -0.5), ("duration_s", -0.4),
+                ("decay_slope_db_s", -0.3), ("spectral_centroid", 0.05)]
+    name, lo, hi, flip = BU._name_pc(loadings)
+    assert name == "Envelope (kick ↔ bass)"
+    assert flip is True                      # negative anchor → flip
+    assert "punchy" in lo and "sustained" in hi
+
+
+def test_name_pc_brightness_positive_no_flip():
+    loadings = [("spectral_centroid", 0.4), ("spectral_rolloff", 0.35),
+                ("spectral_spread", 0.3), ("mfcc_1", -0.2)]
+    name, lo, hi, flip = BU._name_pc(loadings)
+    assert name == "Brightness"
+    assert flip is False
+
+
+def test_name_pc_falls_back_to_generic():
+    name, lo, hi, flip = BU._name_pc([("f0_median", 0.9), ("zcr", 0.1)])
+    assert name == "PC"                      # no 2-of-N family matched
+
+
+@pytest.mark.skipif(not OUT.exists(), reason="unwrapped.json not built (run build_unwrapped.py)")
+def test_output_contract():
+    d = json.loads(OUT.read_text())
+    fams = set(d["families"])
+    k = d["n_clusters"]
+    assert len(d["samples"]) == d["headline"]["n_files"]
+    assert len(d["pc_axes"]) == 5
+    seen_fam = set()
+    for s in d["samples"]:
+        assert len(s["pc"]) == 5 and all(isinstance(x, (int, float)) for x in s["pc"])
+        assert 0 <= s["cluster"] < k
+        assert s["family"] is None or s["family"] in fams
+        assert isinstance(s["feat"], dict) and "spectral_centroid" in s["feat"]
+        if "wav" in s:
+            assert s["wav"].startswith("_samples/")
+        if s["family"]:
+            seen_fam.add(s["family"])
+    # contingency keys are exactly the labelled families
+    assert set(d["contingency"]) == seen_fam
+    # every PC axis has poles + loadings, explained variance in (0,1)
+    for a in d["pc_axes"]:
+        assert a["lo"] and a["hi"] and a["loadings"]
+        assert 0 < a["explained"] < 1
--- a/armada/tide-table/unwrapped.json
+++ b/armada/tide-table/unwrapped.json