Commit c2d6eedb by PLN (Algolia)

feat(unwrapped): bake explorable per-sample feature dataset (#81)

Joins per-file features × resolver family/kind × standardized PCA projection
(PC1-5) × KMeans timbral cluster × audition .wav path into one unwrapped.json
the browser explores without recomputing. Reuses feature_eda.load_matrix so the
matrix matches the MDA exactly. Names the 5 leading PCs as superfeature axes
(Brightness / Timbre / Loudness / Envelope kick-bass / Tonal-noisy), orienting
coords by a stable anchor so poles read right. Contract tests green.
parent 9134398c
dist/
__pycache__/
# machine-local audition symlink → Dirt-Samples (recreated by build_unwrapped.py)
_samples
#!/usr/bin/env python3
"""build_unwrapped — bake the explorable 'ParVagues Unwrapped' viz dataset.
The MDA (feature_eda) found the story; this lays it out as ONE per-sample table the
browser can explore without recomputing anything. We reuse feature_eda.load_matrix so
the rows, columns and median-imputation match the analysis EXACTLY (no second source of
truth for the matrix), then add the three things a scatter needs that the JSON findings
don't carry per file: the PCA projection (PC1..5 coords), the KMeans timbral-cluster id,
and an audition path to the actual .wav.
Per-file row = {folder, name, family, kind, agrees, cluster, pc[5], feat{…}, wav}.
Plus: the 5 named SUPERFEATURE axes (PC name + poles + loadings), a shortlist of raw
axes (centroid/attack/…), the fleet family palette, the family×cluster contingency
(the 'folders are loose' grid), RF importances, and headline MDA numbers.
A folder is a LOOSE grouping ([[feedback_sample_grouping_loose]]): we color by per-file
family by default and offer audio-cluster / kind / folder-agreement as alternate lenses
so the mismatch between timbre and label is explorable, not hidden.
python3 build_unwrapped.py # → unwrapped.json (+ _samples symlink)
"""
import json
import sys
from collections import Counter, defaultdict
from datetime import date
from pathlib import Path
import numpy as np
import feature_eda as FE
import models as M
import sample_meta as META
# Directory containing this script; every other path below hangs off it.
HERE = Path(__file__).resolve().parents[0]
# Design-token file of the sibling UI package (family palette source).
TOKENS = HERE.parent.joinpath("ui", "src", "tokens.json")
# Per-folder family/kind document read by main().
FAMILIES = HERE.joinpath("sample_families.json")
# The baked dataset this script writes.
OUT = HERE.joinpath("unwrapped.json")
SAMPLES_LINK = HERE.joinpath("_samples")  # relative symlink → Dirt-Samples (gitignored)
# Raw (non-PCA) features offered to the browser as direct axes, described in
# musician-friendly language. Each entry is (feature key, label, description, unit).
RAW_AXES = [
    (
        "spectral_centroid",
        "Brightness",
        "spectral centroid (Hz) — dull → bright",
        "Hz",
    ),
    (
        "temporal_centroid",
        "Attack ↔ Sustain",
        "energy-time centroid (s) — punchy → sustained",
        "s",
    ),
    ("rms_db", "Loudness", "RMS level (dBFS)", "dB"),
    (
        "decay_slope_db_s",
        "Decay speed",
        "post-attack decay (dB/s) — fast → slow",
        "dB/s",
    ),
    (
        "spectral_flatness",
        "Tonal ↔ Noisy",
        "spectral flatness — pitched → noisy",
        "",
    ),
    ("pct_percussive", "Percussiveness", "HPSS percussive fraction", ""),
    ("duration_s", "Length", "sample length (s)", "s"),
    ("log_attack_time", "Attack time", "log attack time (log s)", "log s"),
    ("zcr", "Zero-crossings", "zero-crossing rate — low → high/noisy", ""),
    ("hnr", "Harmonicity", "harmonic-to-noise ratio (dB)", "dB"),
    ("f0_median", "Pitch", "median fundamental (Hz), where pitched", "Hz"),
]

# Feature keys shipped raw on every sample row (axis pool + tooltip), in addition
# to the PC coordinates. Same order as RAW_AXES.
RAW_KEEP = [key for key, _label, _desc, _unit in RAW_AXES]
# Naming table for the leading PCs. Each entry is
# (candidate feature keys, axis name, low-pole label, high-pole label); a PC takes
# the first entry for which at least two candidate keys are among its strongest
# loadings (see _name_pc).
PC_NAMING = [
    (
        ["spectral_centroid", "spectral_rolloff", "spectral_spread", "spectral_bandwidth"],
        "Brightness",
        "dark / sub",
        "bright / airy",
    ),
    (
        ["mfcc_5", "mfcc_6", "mfcc_7", "mfcc_8", "mfcc_4"],
        "Timbre colour",
        "hollow",
        "full / vowel-like",
    ),
    (
        ["rms_db", "peak_db", "spectral_flux", "mfcc_0"],
        "Loudness / energy",
        "quiet",
        "loud / dense",
    ),
    (
        ["temporal_centroid", "decay_slope_db_s", "log_attack_time", "duration_s", "sustain_ratio"],
        "Envelope (kick ↔ bass)",
        "punchy / transient",
        "sustained / held",
    ),
    (
        ["spectral_kurtosis", "chroma_entropy", "pct_percussive", "spectral_flatness", "hnr"],
        "Tonal ↔ noisy",
        "pitched / clean",
        "noisy / textured",
    ),
]


def _name_pc(loadings):
    """Name one principal component from its loadings and decide its orientation.

    ``loadings`` is an iterable of ``(feature_name, weight)`` pairs. The six pairs
    with the largest absolute weight form the "leaders"; the first PC_NAMING entry
    with at least two of its keys among the leaders supplies the name and poles.

    Returns ``(name, lo, hi, flip)``. ``flip`` is true when the strongest-loading
    feature belonging to the matched entry has a negative weight. The lo/hi labels
    are never swapped — the caller negates the coordinates instead, so the labels
    stay attached to the right pole. When nothing matches, the generic
    ``("PC", "low", "high", False)`` is returned.
    """

    def _by_magnitude(pair):
        # Negated so an ascending sort yields largest |weight| first.
        return -abs(pair[1])

    ranked = sorted(loadings, key=_by_magnitude)
    leaders = {feature for feature, _weight in ranked[:6]}

    for keys, name, lo, hi in PC_NAMING:
        hits = sum(1 for feature in keys if feature in leaders)
        if hits < 2:
            continue
        # Anchor = the heaviest loading that belongs to this naming entry; its
        # sign decides whether the axis must be mirrored.
        for feature, weight in ranked:
            if feature in keys:
                return name, lo, hi, weight < 0
        # Not reachable once two keys matched; mirrors the original fallback anchor.
        return name, lo, hi, ranked[0][1] < 0

    return "PC", "low", "high", False
def main():
    """Bake ``unwrapped.json``: one per-sample table plus axis/palette metadata.

    Steps, in order:
      1. load the feature matrix through ``feature_eda`` and standardize it;
      2. fit a full PCA and keep the first ``n_pc`` (5) coordinates per file;
      3. fit KMeans with k = number of distinct non-empty families (minimum 2);
      4. join the family palette (tokens.json), per-folder kind/agreement
         (sample_families.json) and the real .wav filenames onto every row;
      5. name and orient the leading PCs, negating stored coords where needed;
      6. write the payload as JSON and try to create the ``_samples`` symlink.

    Every JSON file is read and written with an explicit UTF-8 encoding. The
    payload is dumped with ``ensure_ascii=False`` and carries characters such as
    ``↔`` and ``→``; relying on the platform default encoding makes the write
    fail (or produce a file the browser cannot decode) on non-UTF-8 locales.

    Returns None; results are the written file, the symlink and stdout/stderr lines.
    """
    X, names, rows = FE.load_matrix()
    Z = FE._standardize(X)

    # ── PCA projection (same matrix as the MDA) ────────────────────────────────
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    p = PCA().fit(Z)
    proj = p.transform(Z)  # n_files × n_components
    ev = p.explained_variance_ratio_
    cum = np.cumsum(ev)
    n_pc = 5

    # ── KMeans timbral clusters (k = #families, like the MDA) ──────────────────
    fams_present = sorted({r["family"] for r in rows if r["family"]})
    k = max(2, len(fams_present))
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(Z)
    clusters = km.labels_

    # ── fleet family palette (color + glyph) ───────────────────────────────────
    tok = json.loads(TOKENS.read_text(encoding="utf-8"))
    fam_pal = {f["key"]: {"label": f["label"], "color": f["base"], "glyph": f["glyph"]}
               for f in tok["sample"]}

    # ── per-folder kind / agreement + real .wav filenames ──────────────────────
    famdoc = json.loads(FAMILIES.read_text(encoding="utf-8"))["families"]
    folder_meta = {name: {"kind": v["kind"], "dominant": v["dominant"],
                          "agrees": v["folder_agrees"]}
                   for name, v in famdoc.items()}
    wav_by = {}  # (folder, stem) → filename
    for folder in {r["folder"] for r in rows}:
        for f in META.folder_files(folder):
            wav_by[(folder, f.stem)] = f.name

    # ── assemble per-file rows ─────────────────────────────────────────────────
    fidx = {n: i for i, n in enumerate(names)}
    samples, ncluster = [], Counter()
    contingency = defaultdict(Counter)  # family → cluster → count
    for i, r in enumerate(rows):
        fam = r["family"]
        # Folders missing from sample_families.json yield kind/agrees = None.
        fm = folder_meta.get(r["folder"], {})
        wav = wav_by.get((r["folder"], r["file"]))
        cluster_id = int(clusters[i])
        rec = {
            "folder": r["folder"], "name": r["file"], "family": fam,
            "kind": fm.get("kind"), "agrees": fm.get("agrees"),
            "cluster": cluster_id,
            "pc": [round(float(proj[i, j]), 3) for j in range(n_pc)],
            "feat": {kk: round(float(X[i, fidx[kk]]), 4)
                     for kk in RAW_KEEP if kk in fidx},
        }
        # "wav" is only present when a matching file was found for audition.
        if wav:
            rec["wav"] = f"_samples/{r['folder']}/{wav}"
        samples.append(rec)
        ncluster[cluster_id] += 1
        if fam:
            contingency[fam][cluster_id] += 1

    # ── named superfeature axes (PC pole language + loadings) ──────────────────
    pc_axes = []
    for j in range(n_pc):
        comp = list(zip(names, p.components_[j]))
        name, lo, hi, flip = _name_pc(comp)
        sign = -1.0 if flip else 1.0
        top = sorted(comp, key=lambda lw: -abs(lw[1]))[:6]
        pc_axes.append({
            "key": f"pc{j}", "label": f"{name}",
            "explained": round(float(ev[j]), 4),
            "lo": lo, "hi": hi,
            "loadings": [{"f": f, "w": round(float(w * sign), 3)} for f, w in top],
        })
        # apply the orientation flip to the stored coords too, so hi/lo read right
        if flip:
            for s in samples:
                s["pc"][j] = round(-s["pc"][j], 3)

    # ── family × cluster contingency (the 'folders are loose' grid) ────────────
    grid = {fam: {str(c): contingency[fam].get(c, 0) for c in range(k)}
            for fam in fams_present}

    # ── headline MDA numbers (recomputed-consistent + from feature_eda.json) ───
    # feature_eda.json is optional: without it the EDA-derived fields are None/[].
    eda_path = HERE / "feature_eda.json"
    eda = json.loads(eda_path.read_text(encoding="utf-8")) if eda_path.exists() else {}
    eda_clustering = eda.get("clustering", {})
    eda_correlation = eda.get("correlation", {})
    headline = {
        "n_files": len(samples),
        "n_features": len(names),
        "n_families": len(fams_present),
        "intrinsic_dim_90": int(np.searchsorted(cum, 0.90) + 1),
        "intrinsic_dim_95": int(np.searchsorted(cum, 0.95) + 1),
        "ari_vs_resolver": eda_clustering.get("ari_vs_resolver"),
        "nmi_vs_resolver": eda_clustering.get("nmi_vs_resolver"),
        "kmeans_k": k,
        "explained_5pc": round(float(cum[n_pc - 1]), 3),
    }
    rf = eda_clustering.get("top_family_features", [])
    correlation = {"n_pairs": eda_correlation.get("n_correlated_pairs"),
                   "n_kept": eda_correlation.get("n_kept"),
                   "pruned": eda_correlation.get("pruned", [])}
    prov = M.Provenance(source=M.Source.derived,
                        locator="build_unwrapped: features × resolver × PCA/KMeans",
                        as_of=date.today()).model_dump(mode="json")
    payload = {
        "schema": "ParVagues Unwrapped — explorable per-sample feature space (#81)",
        "provenance": prov,
        "headline": headline,
        "families": fam_pal,
        "family_order": fams_present,
        "n_clusters": k,
        "cluster_sizes": {str(c): ncluster[c] for c in range(k)},
        "pc_axes": pc_axes,
        "raw_axes": [{"key": kk, "label": lbl, "desc": desc, "unit": unit}
                     for kk, lbl, desc, unit in RAW_AXES if kk in fidx],
        "rf_importance": rf,
        "correlation": correlation,
        "contingency": grid,
        "samples": samples,
    }
    # ensure_ascii=False keeps the labels human-readable, so the file must be
    # written as UTF-8 explicitly rather than in the locale's default encoding.
    OUT.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
    sz = OUT.stat().st_size / 1024
    print(f"✓ {OUT.name}: {len(samples)} samples, {len(names)} feats, "
          f"{k} clusters, {len(fams_present)} families ({sz:.0f} KB)")

    # ── audition symlink (relative file:// audition; gitignored, machine-local) ─
    # Best effort: a failure only disables audition, the dataset is already written.
    if not SAMPLES_LINK.exists():
        try:
            SAMPLES_LINK.symlink_to(META.DIRT)
            print(f" + {SAMPLES_LINK.name} → {META.DIRT} (audition)")
        except OSError as e:
            print(f" (no _samples symlink: {e} — audition disabled)", file=sys.stderr)
    n_wav = sum(1 for s in samples if "wav" in s)
    print(f" {n_wav}/{len(samples)} samples auditionable")


if __name__ == "__main__":
    main()
"""Contract tests for the Unwrapped viz dataset (build_unwrapped).
The heavy math (matrix, PCA, clustering) is feature_eda's and tested there; here we
guard the BAKING contract the browser depends on — every sample carries the fields the
scatter reads, families resolve to the fleet palette, clusters are in range, and the PC
naming heuristic picks the right human name + sign from a known loading.
"""
import json
from pathlib import Path
import pytest
import build_unwrapped as BU
# Two levels up from this test file: the parent of the directory that contains it.
# Presumably the directory build_unwrapped.py writes into — verify against the repo layout.
HERE = Path(__file__).resolve().parent.parent
# The baked dataset; the output-contract test is skipped when this file is absent.
OUT = HERE.joinpath("unwrapped.json")
def test_name_pc_picks_envelope_and_orients_punchy_low():
    """Envelope features dominating with NEGATIVE weights must request a flip.

    temporal_centroid + decay lead the loading and are negative, so the heuristic
    has to report ``flip`` — that keeps the named 'sustained / held' pole on the
    +high end (stable sign across re-runs).
    """
    envelope_loadings = [
        ("temporal_centroid", -0.5),
        ("duration_s", -0.4),
        ("decay_slope_db_s", -0.3),
        ("spectral_centroid", 0.05),
    ]
    label, low_pole, high_pole, flipped = BU._name_pc(envelope_loadings)
    assert label == "Envelope (kick ↔ bass)"
    assert flipped is True  # negative anchor → flip
    assert "punchy" in low_pole
    assert "sustained" in high_pole
def test_name_pc_brightness_positive_no_flip():
    """Spectral features leading with POSITIVE weights name 'Brightness', no flip."""
    bright_loadings = [
        ("spectral_centroid", 0.4),
        ("spectral_rolloff", 0.35),
        ("spectral_spread", 0.3),
        ("mfcc_1", -0.2),
    ]
    label, _low_pole, _high_pole, flipped = BU._name_pc(bright_loadings)
    assert label == "Brightness"
    assert flipped is False
def test_name_pc_falls_back_to_generic():
    """Loadings that hit no naming entry twice get the generic 'PC' name."""
    unmatched_loadings = [("f0_median", 0.9), ("zcr", 0.1)]
    result = BU._name_pc(unmatched_loadings)
    label = result[0]
    assert len(result) == 4  # still the (name, lo, hi, flip) shape
    assert label == "PC"  # no 2-of-N family matched
@pytest.mark.skipif(not OUT.exists(), reason="unwrapped.json not built (run build_unwrapped.py)")
def test_output_contract():
    """Guard the shape of the baked dataset that the browser reads.

    Checks, on the built ``unwrapped.json``: sample count matches the headline,
    exactly 5 PC axes, every sample has 5 numeric PC coords, an in-range cluster,
    a family known to the palette (or None), a ``feat`` dict carrying
    ``spectral_centroid``, and — when present — a ``wav`` path under ``_samples/``.
    Skipped when the file has not been built.
    """
    # Decode explicitly as UTF-8: that is how the browser will decode the file, so
    # the locale's default encoding must not influence what this test sees.
    d = json.loads(OUT.read_text(encoding="utf-8"))
    fams = set(d["families"])
    k = d["n_clusters"]
    assert len(d["samples"]) == d["headline"]["n_files"]
    assert len(d["pc_axes"]) == 5
    seen_fam = set()
    for s in d["samples"]:
        assert len(s["pc"]) == 5 and all(isinstance(x, (int, float)) for x in s["pc"])
        assert 0 <= s["cluster"] < k
        assert s["family"] is None or s["family"] in fams
        assert isinstance(s["feat"], dict) and "spectral_centroid" in s["feat"]
        if "wav" in s:
            assert s["wav"].startswith("_samples/")
        if s["family"]:
            seen_fam.add(s["family"])
    # contingency keys are exactly the labelled families
    assert set(d["contingency"]) == seen_fam
    # every PC axis has poles + loadings, explained variance in (0,1)
    for a in d["pc_axes"]:
        assert a["lo"] and a["hi"] and a["loadings"]
        assert 0 < a["explained"] < 1
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment