Commit c2d6eedb by PLN (Algolia)

feat(unwrapped): bake explorable per-sample feature dataset (#81)

Joins per-file features × resolver family/kind × standardized PCA projection
(PC1-5) × KMeans timbral cluster × audition .wav path into one unwrapped.json
the browser explores without recomputing. Reuses feature_eda.load_matrix so the
matrix matches the MDA exactly. Names the 5 leading PCs as superfeature axes
(Brightness / Timbre / Loudness / Envelope kick-bass / Tonal-noisy), orienting
coords by a stable anchor so poles read right. Contract tests green.
parent 9134398c
dist/
__pycache__/
# machine-local audition symlink → Dirt-Samples (recreated by build_unwrapped.py)
_samples
"""Contract tests for the Unwrapped viz dataset (build_unwrapped).
The heavy math (matrix, PCA, clustering) is feature_eda's and tested there; here we
guard the BAKING contract the browser depends on — every sample carries the fields the
scatter reads, families resolve to the fleet palette, clusters are in range, and the PC
naming heuristic picks the right human name + sign from a known loading.
"""
import json
from pathlib import Path
import pytest
import build_unwrapped as BU
# NOTE: despite its name, HERE is the directory *two* levels above this test
# file (the parent of the folder that contains it); the baked dataset is
# looked up directly inside that directory.
HERE = Path(__file__).resolve().parents[1]
OUT = HERE.joinpath("unwrapped.json")
def test_name_pc_picks_envelope_and_orients_punchy_low():
    """Envelope-dominated loadings with a negative sign must be named and flipped.

    temporal_centroid and the decay/duration features carry the largest
    weights and all of them are NEGATIVE, so the axis has to be flipped for
    the 'sustained/held' pole to stay on the +high end (a sign that is stable
    across re-runs).
    """
    envelope_loadings = [
        ("temporal_centroid", -0.5),
        ("duration_s", -0.4),
        ("decay_slope_db_s", -0.3),
        ("spectral_centroid", 0.05),
    ]

    axis_name, low_pole, high_pole, flipped = BU._name_pc(envelope_loadings)

    assert axis_name == "Envelope (kick ↔ bass)"
    # A negative anchor loading means the axis orientation gets reversed.
    assert flipped is True
    assert "punchy" in low_pole and "sustained" in high_pole
def test_name_pc_brightness_positive_no_flip():
    """Positive spectral loadings are named 'Brightness' and keep their sign."""
    spectral_loadings = [
        ("spectral_centroid", 0.4),
        ("spectral_rolloff", 0.35),
        ("spectral_spread", 0.3),
        ("mfcc_1", -0.2),
    ]

    # The pole labels are unpacked (the helper returns four values) but are
    # not part of what this test checks.
    axis_name, _low_pole, _high_pole, flipped = BU._name_pc(spectral_loadings)

    assert axis_name == "Brightness"
    assert flipped is False
def test_name_pc_falls_back_to_generic():
    """Loadings that match no named family fall back to the generic 'PC' name."""
    unmatched_loadings = [("f0_median", 0.9), ("zcr", 0.1)]

    axis_name, _low_pole, _high_pole, _flipped = BU._name_pc(unmatched_loadings)

    # no 2-of-N family matched
    assert axis_name == "PC"
@pytest.mark.skipif(not OUT.exists(), reason="unwrapped.json not built (run build_unwrapped.py)")
def test_output_contract():
    """Guard the shape of the baked unwrapped.json that the browser reads.

    Skipped when the dataset has not been built. Checks, in order:
    the sample count matches the headline, there are five PC axes, every
    sample has five numeric PC coordinates, an in-range cluster id, a family
    that is either None or one of the declared families, a ``feat`` dict
    containing ``spectral_centroid`` and (when present) a ``wav`` path under
    ``_samples/``; the contingency table is keyed by exactly the families
    seen on samples; and every PC axis has poles, loadings and an explained
    variance strictly between 0 and 1.
    """
    # JSON is UTF-8 on the wire and the browser decodes it as such; decode
    # explicitly instead of relying on the platform's locale default, which
    # would mis-read non-ASCII text (e.g. the "↔" in axis names) on some hosts.
    data = json.loads(OUT.read_text(encoding="utf-8"))

    n_pcs = 5
    declared_families = set(data["families"])
    n_clusters = data["n_clusters"]

    assert len(data["samples"]) == data["headline"]["n_files"]
    assert len(data["pc_axes"]) == n_pcs

    labelled_families = set()
    for sample in data["samples"]:
        coords = sample["pc"]
        assert len(coords) == n_pcs
        assert all(isinstance(value, (int, float)) for value in coords)
        assert 0 <= sample["cluster"] < n_clusters

        family = sample["family"]
        assert family is None or family in declared_families

        assert isinstance(sample["feat"], dict)
        assert "spectral_centroid" in sample["feat"]

        # The audition path is optional, but when present it must point
        # into the _samples/ tree.
        if "wav" in sample:
            assert sample["wav"].startswith("_samples/")

        if family:
            labelled_families.add(family)

    # contingency keys are exactly the labelled families
    assert set(data["contingency"]) == labelled_families

    # every PC axis has poles + loadings, explained variance in (0,1)
    for axis in data["pc_axes"]:
        assert axis["lo"]
        assert axis["hi"]
        assert axis["loadings"]
        assert 0 < axis["explained"] < 1
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment