Commit 01b88ed5 by PLN (Algolia)

feat(feature_eda): MDA over 1485 samples → the superfeature axes

sample_features.py overfetched 36 L0/L1 features × 1485 corpus samples; feature_eda
mines them three ways:
- correlation: only 2 redundant pairs ≥0.9 (duration~temporal_centroid 0.97,
  bandwidth~rolloff 0.91) → the overfetch was lean, 34/36 independent.
- PCA: intrinsic dim 19 (90%) / 24 (95%) — genuinely high-D. The 5 leading PCs are
  interpretable SUPERFEATURE AXES: PC1 brightness (rolloff/centroid), PC2 timbre
  (mfcc5-8), PC3 loudness (rms/peak/flux), PC4 envelope/time (temporal_centroid,
  decay_slope, attack — the kickbass axis), PC5 tonal-vs-noisy (kurtosis/chroma_entropy).
- clustering: KMeans(12) vs resolver families ARI=0.25 NMI=0.40 (timbral clusters
  partly orthogonal to semantic family — consistent with 'folders are loose').
  RF importance: spectral_centroid + temporal_centroid are the #1/#2 family
  discriminators → validates productizing the kickbass tiebreaker (#80).
TDD: 3 synthetic invariants (redundancy/dim/separation) + real-data load guard.
parent a93c47af
{
"schema": "feature MDA (correlation prune + PCA + clustering)",
"n_files": 1485,
"n_features": 36,
"correlation": {
"thresh": 0.9,
"n_correlated_pairs": 2,
"top_pairs": [
{
"a": "duration_s",
"b": "temporal_centroid",
"r": 0.969
},
{
"a": "spectral_bandwidth",
"b": "spectral_rolloff",
"r": 0.906
}
],
"pruned": [
{
"drop": "spectral_rolloff",
"kept": "spectral_bandwidth",
"r": 0.906
},
{
"drop": "temporal_centroid",
"kept": "duration_s",
"r": 0.969
}
],
"kept": [
"chroma_entropy",
"crest_db",
"decay_slope_db_s",
"duration_s",
"f0_median",
"hnr",
"key_strength",
"log_attack_time",
"mfcc_0",
"mfcc_1",
"mfcc_10",
"mfcc_11",
"mfcc_12",
"mfcc_2",
"mfcc_3",
"mfcc_4",
"mfcc_5",
"mfcc_6",
"mfcc_7",
"mfcc_8",
"mfcc_9",
"pct_percussive",
"peak_db",
"rms_db",
"spectral_bandwidth",
"spectral_centroid",
"spectral_contrast",
"spectral_flatness",
"spectral_flux",
"spectral_kurtosis",
"spectral_skew",
"spectral_spread",
"sustain_ratio",
"zcr"
],
"n_kept": 34
},
"pca": {
"n_features": 36,
"intrinsic_dim_90pct": 19,
"intrinsic_dim_95pct": 24,
"explained_variance_ratio": [
0.2017,
0.1822,
0.0901,
0.0608,
0.0497,
0.0447,
0.0401,
0.0342,
0.0295,
0.0269,
0.0229,
0.022
],
"components": [
{
"pc": 1,
"explained": 0.202,
"top_loadings": [
{
"f": "spectral_rolloff",
"w": 0.297
},
{
"f": "spectral_spread",
"w": 0.289
},
{
"f": "mfcc_1",
"w": -0.285
},
{
"f": "spectral_centroid",
"w": 0.271
},
{
"f": "zcr",
"w": 0.262
},
{
"f": "pct_percussive",
"w": 0.259
}
]
},
{
"pc": 2,
"explained": 0.182,
"top_loadings": [
{
"f": "mfcc_6",
"w": 0.3
},
{
"f": "mfcc_5",
"w": 0.3
},
{
"f": "mfcc_8",
"w": 0.3
},
{
"f": "mfcc_7",
"w": 0.28
},
{
"f": "mfcc_10",
"w": 0.269
},
{
"f": "mfcc_9",
"w": 0.263
}
]
},
{
"pc": 3,
"explained": 0.09,
"top_loadings": [
{
"f": "mfcc_0",
"w": 0.491
},
{
"f": "rms_db",
"w": 0.406
},
{
"f": "spectral_flux",
"w": 0.356
},
{
"f": "peak_db",
"w": 0.35
},
{
"f": "crest_db",
"w": -0.293
},
{
"f": "sustain_ratio",
"w": 0.258
}
]
},
{
"pc": 4,
"explained": 0.061,
"top_loadings": [
{
"f": "temporal_centroid",
"w": 0.351
},
{
"f": "duration_s",
"w": 0.347
},
{
"f": "decay_slope_db_s",
"w": 0.291
},
{
"f": "log_attack_time",
"w": 0.283
},
{
"f": "mfcc_10",
"w": 0.268
},
{
"f": "mfcc_11",
"w": 0.267
}
]
},
{
"pc": 5,
"explained": 0.05,
"top_loadings": [
{
"f": "spectral_kurtosis",
"w": 0.343
},
{
"f": "chroma_entropy",
"w": -0.308
},
{
"f": "spectral_skew",
"w": 0.269
},
{
"f": "pct_percussive",
"w": -0.239
},
{
"f": "mfcc_3",
"w": -0.238
},
{
"f": "temporal_centroid",
"w": -0.233
}
]
}
]
},
"clustering": {
"n_labelled": 825,
"kmeans_k": 12,
"ari_vs_resolver": 0.253,
"nmi_vs_resolver": 0.4,
"family_distribution": {
"vox": 146,
"bass": 104,
"keys": 98,
"kick": 84,
"hat": 71,
"snare": 70,
"break": 54,
"fx": 51,
"lead": 44,
"pad": 38,
"synth": 35,
"perc": 30
},
"rf_oob_note": "importances on standardized per-file features",
"top_family_features": [
{
"f": "spectral_centroid",
"importance": 0.0603
},
{
"f": "temporal_centroid",
"importance": 0.0556
},
{
"f": "spectral_contrast",
"importance": 0.0441
},
{
"f": "duration_s",
"importance": 0.0371
},
{
"f": "spectral_skew",
"importance": 0.0345
},
{
"f": "pct_percussive",
"importance": 0.033
},
{
"f": "hnr",
"importance": 0.0319
},
{
"f": "zcr",
"importance": 0.0314
},
{
"f": "spectral_bandwidth",
"importance": 0.0307
},
{
"f": "chroma_entropy",
"importance": 0.0307
},
{
"f": "spectral_rolloff",
"importance": 0.0294
},
{
"f": "mfcc_1",
"importance": 0.0284
},
{
"f": "mfcc_2",
"importance": 0.0283
},
{
"f": "spectral_flatness",
"importance": 0.0282
},
{
"f": "spectral_spread",
"importance": 0.0274
}
]
}
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
"""Deterministic UTs for the feature MDA logic (feature_eda).
We don't need the real corpus to trust the math: build a tiny synthetic matrix with
KNOWN structure (a redundant pair, a known intrinsic dimensionality, two separable
clusters) and assert that each lens recovers it. Real-data validation is the `all` run;
this suite guards the analysis itself against silent regressions.
"""
import numpy as np
import pytest
import feature_eda as FE
def test_correlate_finds_and_prunes_redundant_pair():
    """correlate() must flag the one near-duplicate column pair and drop one member.

    The matrix has four columns: ``a``, ``b`` (≈ 2·a plus tiny noise, so almost
    perfectly correlated with ``a``), and two independent noise columns ``c``/``d``.
    """
    rng = np.random.RandomState(0)
    n_samples = 200
    # Draw order matters for reproducibility: base, then b's jitter, then c, then d.
    base = rng.randn(n_samples)
    near_copy = base * 2 + 1e-6 * rng.randn(n_samples)  # b ≈ 2a → redundant
    indep_c = rng.randn(n_samples)
    indep_d = rng.randn(n_samples)
    X = np.column_stack([base, near_copy, indep_c, indep_d])

    report = FE.correlate(X, ["a", "b", "c", "d"], thresh=0.9)

    assert report["n_correlated_pairs"] == 1
    # Exactly one of {a, b} is pruned; which one is left to the implementation.
    dropped = {entry["drop"] for entry in report["pruned"]}
    assert len(dropped) == 1 and dropped <= {"a", "b"}
    # The independent columns must both survive.
    survivors = report["kept"]
    for column in ("c", "d"):
        assert column in survivors
    assert report["n_kept"] == 3
def test_pca_recovers_low_intrinsic_dim():
    """pca() must report at most 2 intrinsic dims (90%) for a rank-2 matrix.

    Five columns are built as linear combinations of only two latent factors,
    so two components should carry essentially all the variance.
    """
    rng = np.random.RandomState(1)
    n_samples = 300
    f1 = rng.randn(n_samples)
    f2 = rng.randn(n_samples)
    columns = [
        f1,
        f2,
        f1 + f2,
        f1 - f2,
        0.5 * f1 + 0.3 * f2,
    ]
    X = np.column_stack(columns)
    feature_names = [f"x{idx}" for idx in range(len(columns))]

    report = FE.pca(X, feature_names)

    assert report["intrinsic_dim_90pct"] <= 2
    leading = report["components"][0]
    assert leading["explained"] > 0
def test_cluster_separates_two_known_groups():
    """cluster() must agree with the labels on two well-separated blobs.

    Two Gaussian groups are offset by ±5 along the first two features (s0, s1);
    the other two features are pure noise. One group is labelled "kick", the
    other "bass".
    """
    rng = np.random.RandomState(2)
    n = 60
    offset = np.array([5, 5, 0, 0])
    # Draw order matters for reproducibility: the +5 group first, then the -5 group.
    blob_pos = rng.randn(n, 4) + offset
    blob_neg = rng.randn(n, 4) + np.array([-5, -5, 0, 0])
    X = np.vstack([blob_pos, blob_neg])
    names = ["s0", "s1", "noise0", "noise1"]

    # NOTE(review): both families reuse file ids "0".."59" under the same folder.
    # Presumably harmless if cluster() only reads "family" — confirm against FE.cluster.
    rows = [
        {"folder": "f", "file": str(i), "family": family}
        for family in ("kick", "bass")
        for i in range(n)
    ]

    report = FE.cluster(X, names, rows)

    assert report["n_labelled"] == 2 * n
    assert report["ari_vs_resolver"] > 0.8  # cleanly separable → high agreement
    # The separating features (s0/s1) must outrank the noise dims in importance.
    leading_two = {item["f"] for item in report["top_family_features"][:2]}
    assert leading_two == {"s0", "s1"}
@pytest.mark.skipif(
    not FE.FEATURES.exists(),
    reason="sample_features.json not built yet (run sample_features.py)",
)
def test_load_matrix_on_real_features():
    """Smoke-check load_matrix() on the real feature file, when it exists.

    Skipped when FE.FEATURES is absent. Verifies that the matrix is non-empty,
    NaN-free, and that names/rows line up with its columns/rows.
    """
    X, names, rows = FE.load_matrix()
    n_rows = X.shape[0]
    n_cols = X.shape[1]

    assert n_rows > 0
    assert n_cols > 0
    # Per the original test's note: median-imputation should leave no NaNs.
    assert not np.any(np.isnan(X))
    assert len(names) == n_cols
    assert len(rows) == n_rows
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment