feat(feature_eda): MDA over 1485 samples → the superfeature axes

sample_features.py overfetched 36 L0/L1 features × 1485 corpus samples; feature_eda mines them three ways:
- correlation: only 2 redundant pairs at r ≥ 0.9 (duration~temporal_centroid 0.97, bandwidth~rolloff 0.91) → the overfetch was lean, 34/36 independent.
- PCA: intrinsic dim is 19 components for 90% explained variance / 24 for 95% — genuinely high-D. The 5 leading PCs are interpretable SUPERFEATURE AXES: PC1 brightness (rolloff/centroid), PC2 timbre (mfcc5-8), PC3 loudness (rms/peak/flux), PC4 envelope/time (temporal_centroid, decay_slope, attack — the kick↔bass axis), PC5 tonal-vs-noisy (kurtosis/chroma_entropy).
- clustering: KMeans(12) vs resolver families gives ARI=0.25, NMI=0.40 (timbral clusters partly orthogonal to semantic family — consistent with 'folders are loose'). RF importance: spectral_centroid + temporal_centroid are the #1/#2 family discriminators → validates productizing the kick↔bass tiebreaker (#80).

TDD: 3 synthetic invariants (redundancy/dim/separation) + real-data load guard.

feat(feature_eda): MDA over 1485 samples → the superfeature axes
sample_features.py overfetched 36 L0/L1 features × 1485 corpus samples; feature_eda mines them three ways:
- correlation: only 2 redundant pairs at r ≥ 0.9 (duration~temporal_centroid 0.97, bandwidth~rolloff 0.91) → the overfetch was lean, 34/36 independent.
- PCA: intrinsic dim is 19 components for 90% explained variance / 24 for 95% — genuinely high-D. The 5 leading PCs are interpretable SUPERFEATURE AXES: PC1 brightness (rolloff/centroid), PC2 timbre (mfcc5-8), PC3 loudness (rms/peak/flux), PC4 envelope/time (temporal_centroid, decay_slope, attack — the kick↔bass axis), PC5 tonal-vs-noisy (kurtosis/chroma_entropy).
- clustering: KMeans(12) vs resolver families gives ARI=0.25, NMI=0.40 (timbral clusters partly orthogonal to semantic family — consistent with 'folders are loose'). RF importance: spectral_centroid + temporal_centroid are the #1/#2 family discriminators → validates productizing the kick↔bass tiebreaker (#80).

TDD: 3 synthetic invariants (redundancy/dim/separation) + real-data load guard.
01b88ed5 · PLN (Algolia) · a93c47af · 01b88ed5 · 01b88ed5 · 01b88ed5
Commit 01b88ed5 authored Jun 07, 2026 by PLN (Algolia)
4 changed files
--- a/armada/tide-table/feature_eda.json
+++ b/armada/tide-table/feature_eda.json
+{
+ "schema": "feature MDA (correlation prune + PCA + clustering)",
+ "n_files": 1485,
+ "n_features": 36,
+ "correlation": {
+  "thresh": 0.9,
+  "n_correlated_pairs": 2,
+  "top_pairs": [
+   {
+    "a": "duration_s",
+    "b": "temporal_centroid",
+    "r": 0.969
+   },
+   {
+    "a": "spectral_bandwidth",
+    "b": "spectral_rolloff",
+    "r": 0.906
+   }
+  ],
+  "pruned": [
+   {
+    "drop": "spectral_rolloff",
+    "kept": "spectral_bandwidth",
+    "r": 0.906
+   },
+   {
+    "drop": "temporal_centroid",
+    "kept": "duration_s",
+    "r": 0.969
+   }
+  ],
+  "kept": [
+   "chroma_entropy",
+   "crest_db",
+   "decay_slope_db_s",
+   "duration_s",
+   "f0_median",
+   "hnr",
+   "key_strength",
+   "log_attack_time",
+   "mfcc_0",
+   "mfcc_1",
+   "mfcc_10",
+   "mfcc_11",
+   "mfcc_12",
+   "mfcc_2",
+   "mfcc_3",
+   "mfcc_4",
+   "mfcc_5",
+   "mfcc_6",
+   "mfcc_7",
+   "mfcc_8",
+   "mfcc_9",
+   "pct_percussive",
+   "peak_db",
+   "rms_db",
+   "spectral_bandwidth",
+   "spectral_centroid",
+   "spectral_contrast",
+   "spectral_flatness",
+   "spectral_flux",
+   "spectral_kurtosis",
+   "spectral_skew",
+   "spectral_spread",
+   "sustain_ratio",
+   "zcr"
+  ],
+  "n_kept": 34
+ },
+ "pca": {
+  "n_features": 36,
+  "intrinsic_dim_90pct": 19,
+  "intrinsic_dim_95pct": 24,
+  "explained_variance_ratio": [
+   0.2017,
+   0.1822,
+   0.0901,
+   0.0608,
+   0.0497,
+   0.0447,
+   0.0401,
+   0.0342,
+   0.0295,
+   0.0269,
+   0.0229,
+   0.022
+  ],
+  "components": [
+   {
+    "pc": 1,
+    "explained": 0.202,
+    "top_loadings": [
+     {
+      "f": "spectral_rolloff",
+      "w": 0.297
+     },
+     {
+      "f": "spectral_spread",
+      "w": 0.289
+     },
+     {
+      "f": "mfcc_1",
+      "w": -0.285
+     },
+     {
+      "f": "spectral_centroid",
+      "w": 0.271
+     },
+     {
+      "f": "zcr",
+      "w": 0.262
+     },
+     {
+      "f": "pct_percussive",
+      "w": 0.259
+     }
+    ]
+   },
+   {
+    "pc": 2,
+    "explained": 0.182,
+    "top_loadings": [
+     {
+      "f": "mfcc_6",
+      "w": 0.3
+     },
+     {
+      "f": "mfcc_5",
+      "w": 0.3
+     },
+     {
+      "f": "mfcc_8",
+      "w": 0.3
+     },
+     {
+      "f": "mfcc_7",
+      "w": 0.28
+     },
+     {
+      "f": "mfcc_10",
+      "w": 0.269
+     },
+     {
+      "f": "mfcc_9",
+      "w": 0.263
+     }
+    ]
+   },
+   {
+    "pc": 3,
+    "explained": 0.09,
+    "top_loadings": [
+     {
+      "f": "mfcc_0",
+      "w": 0.491
+     },
+     {
+      "f": "rms_db",
+      "w": 0.406
+     },
+     {
+      "f": "spectral_flux",
+      "w": 0.356
+     },
+     {
+      "f": "peak_db",
+      "w": 0.35
+     },
+     {
+      "f": "crest_db",
+      "w": -0.293
+     },
+     {
+      "f": "sustain_ratio",
+      "w": 0.258
+     }
+    ]
+   },
+   {
+    "pc": 4,
+    "explained": 0.061,
+    "top_loadings": [
+     {
+      "f": "temporal_centroid",
+      "w": 0.351
+     },
+     {
+      "f": "duration_s",
+      "w": 0.347
+     },
+     {
+      "f": "decay_slope_db_s",
+      "w": 0.291
+     },
+     {
+      "f": "log_attack_time",
+      "w": 0.283
+     },
+     {
+      "f": "mfcc_10",
+      "w": 0.268
+     },
+     {
+      "f": "mfcc_11",
+      "w": 0.267
+     }
+    ]
+   },
+   {
+    "pc": 5,
+    "explained": 0.05,
+    "top_loadings": [
+     {
+      "f": "spectral_kurtosis",
+      "w": 0.343
+     },
+     {
+      "f": "chroma_entropy",
+      "w": -0.308
+     },
+     {
+      "f": "spectral_skew",
+      "w": 0.269
+     },
+     {
+      "f": "pct_percussive",
+      "w": -0.239
+     },
+     {
+      "f": "mfcc_3",
+      "w": -0.238
+     },
+     {
+      "f": "temporal_centroid",
+      "w": -0.233
+     }
+    ]
+   }
+  ]
+ },
+ "clustering": {
+  "n_labelled": 825,
+  "kmeans_k": 12,
+  "ari_vs_resolver": 0.253,
+  "nmi_vs_resolver": 0.4,
+  "family_distribution": {
+   "vox": 146,
+   "bass": 104,
+   "keys": 98,
+   "kick": 84,
+   "hat": 71,
+   "snare": 70,
+   "break": 54,
+   "fx": 51,
+   "lead": 44,
+   "pad": 38,
+   "synth": 35,
+   "perc": 30
+  },
+  "rf_oob_note": "importances on standardized per-file features",
+  "top_family_features": [
+   {
+    "f": "spectral_centroid",
+    "importance": 0.0603
+   },
+   {
+    "f": "temporal_centroid",
+    "importance": 0.0556
+   },
+   {
+    "f": "spectral_contrast",
+    "importance": 0.0441
+   },
+   {
+    "f": "duration_s",
+    "importance": 0.0371
+   },
+   {
+    "f": "spectral_skew",
+    "importance": 0.0345
+   },
+   {
+    "f": "pct_percussive",
+    "importance": 0.033
+   },
+   {
+    "f": "hnr",
+    "importance": 0.0319
+   },
+   {
+    "f": "zcr",
+    "importance": 0.0314
+   },
+   {
+    "f": "spectral_bandwidth",
+    "importance": 0.0307
+   },
+   {
+    "f": "chroma_entropy",
+    "importance": 0.0307
+   },
+   {
+    "f": "spectral_rolloff",
+    "importance": 0.0294
+   },
+   {
+    "f": "mfcc_1",
+    "importance": 0.0284
+   },
+   {
+    "f": "mfcc_2",
+    "importance": 0.0283
+   },
+   {
+    "f": "spectral_flatness",
+    "importance": 0.0282
+   },
+   {
+    "f": "spectral_spread",
+    "importance": 0.0274
+   }
+  ]
+ }
+}
\ No newline at end of file
--- a/armada/tide-table/feature_eda.py
+++ b/armada/tide-table/feature_eda.py
--- a/armada/tide-table/sample_features.json
+++ b/armada/tide-table/sample_features.json
--- a/armada/tide-table/tests/test_feature_eda.py
+++ b/armada/tide-table/tests/test_feature_eda.py
+"""Deterministic UTs for the feature MDA logic (feature_eda).
+
+We don't need the real corpus to trust the math: build a tiny synthetic matrix with
+KNOWN structure (a redundant pair, a known intrinsic dimensionality, two separable
+clusters) and assert each lens recovers it. Real-data validation is the `all` run;
+this guards the analysis itself against silent regressions.
+"""
+import numpy as np
+import pytest
+
+import feature_eda as FE
+
+
+def test_correlate_finds_and_prunes_redundant_pair():
+    """correlate() must flag the single near-duplicate pair and prune one side.
+
+    Fixture: 200 seeded samples in four columns, where b is 2*a plus
+    1e-6-scale noise and c, d are separate draws from the same generator.
+    """
+    rng = np.random.RandomState(0)
+    a = rng.randn(200)
+    X = np.column_stack([a, a * 2 + 1e-6 * rng.randn(200),   # b ≈ 2a  → redundant
+                         rng.randn(200), rng.randn(200)])     # c, d independent
+    names = ["a", "b", "c", "d"]
+    r = FE.correlate(X, names, thresh=0.9)
+    # only the (a, b) pair is expected to reach the 0.9 threshold
+    assert r["n_correlated_pairs"] == 1
+    # exactly one of {a,b} pruned, the other kept; c,d both survive
+    pruned = {p["drop"] for p in r["pruned"]}
+    assert pruned in ({"a"}, {"b"})
+    assert "c" in r["kept"] and "d" in r["kept"]
+    # 4 input columns minus the one pruned column
+    assert r["n_kept"] == 3
+
+
+def test_pca_recovers_low_intrinsic_dim():
+    """pca() must report at most 2 dims at the 90% level for a two-factor matrix.
+
+    All five columns are linear combinations of the two seeded factors
+    f1 and f2 (300 samples each).
+    """
+    rng = np.random.RandomState(1)
+    # 5 columns but only 2 real latent factors → ~2 intrinsic dims
+    f1, f2 = rng.randn(300), rng.randn(300)
+    X = np.column_stack([f1, f2, f1 + f2, f1 - f2, 0.5 * f1 + 0.3 * f2])
+    r = FE.pca(X, [f"x{i}" for i in range(5)])
+    assert r["intrinsic_dim_90pct"] <= 2
+    # sanity: the first reported component carries a positive "explained" value
+    assert r["components"][0]["explained"] > 0
+
+
+def test_cluster_separates_two_known_groups():
+    """cluster() must agree with the family labels on two well-separated groups.
+
+    Two groups of n=60 seeded rows are shifted by +5 / -5 on the first two
+    columns only; the last two columns are unshifted noise. The first group
+    is labelled "kick" and the second "bass".
+    """
+    rng = np.random.RandomState(2)
+    n = 60
+    g0 = rng.randn(n, 4) + np.array([5, 5, 0, 0])
+    g1 = rng.randn(n, 4) + np.array([-5, -5, 0, 0])
+    X = np.vstack([g0, g1])
+    names = ["s0", "s1", "noise0", "noise1"]
+    # NOTE(review): both groups reuse file ids "0".."59" under the same folder;
+    # harmless only if cluster() does not key rows by folder/file — confirm.
+    rows = ([{"folder": "f", "file": str(i), "family": "kick"} for i in range(n)] +
+            [{"folder": "f", "file": str(i), "family": "bass"} for i in range(n)])
+    r = FE.cluster(X, names, rows)
+    # every row carries a family label, so all 2n rows should count as labelled
+    assert r["n_labelled"] == 2 * n
+    assert r["ari_vs_resolver"] > 0.8            # cleanly separable → high agreement
+    # the separating features (s0/s1) must dominate importance over the noise dims
+    top2 = {x["f"] for x in r["top_family_features"][:2]}
+    assert top2 == {"s0", "s1"}
+
+
+@pytest.mark.skipif(not FE.FEATURES.exists(),
+                    reason="sample_features.json not built yet (run sample_features.py)")
+def test_load_matrix_on_real_features():
+    """Guard load_matrix() on the real feature file: non-empty, NaN-free, aligned.
+
+    Skipped when FE.FEATURES does not exist.
+    """
+    X, names, rows = FE.load_matrix()
+    assert X.shape[0] > 0 and X.shape[1] > 0
+    assert not np.isnan(X).any()                 # median-imputation leaves no NaNs
+    # one name per column and one row record per sample
+    assert len(names) == X.shape[1] and len(rows) == X.shape[0]