feat(sample-classify): fine ontology + PANNs + ensemble methods

Per PLN: richer ontology + PANNs/AudioSet + ensembles for sample grounding. - sample_ontology.py: 99 fine descriptors across the 12 families ('this is the sound of {a reese bass}'); scored per-descriptor then marginalized to family. CLAP fine: 58% -> 68% top-1 (coarse super-family 76%) vs the noisy name truth. - sample_panns.py: PANNs Cnn14 (AudioSet 527) -> conservative label->family map -> per-family prob vector. ffmpeg @32k, zero-pad short one-shots (Cnn14 needs >=1s of mel frames or conv5 collapses). Weak on electronic one-shots (AudioSet 'Clapping'=applause, not a drum-machine clap). - sample_classify.py: --method clap|panns|ensemble, --fine|--coarse. clap_vector() exposes the family-prob vector; ensemble = mean of CLAP+PANNs vectors -> argmax. Scoreboard (vs name-heuristic, itself noisy): clap-coarse 58% | clap-fine 68% | panns - | ensemble - (head-to-head primed, not yet run). Stubborn residual = bass<->kick one-shot (spectral decay tiebreaker is the next lever).

feat(sample-classify): fine ontology + PANNs + ensemble methods
Per PLN: richer ontology + PANNs/AudioSet + ensembles for sample grounding. - sample_ontology.py: 99 fine descriptors across the 12 families ('this is the sound of {a reese bass}'); scored per-descriptor then marginalized to family. CLAP fine: 58% -> 68% top-1 (coarse super-family 76%) vs the noisy name truth. - sample_panns.py: PANNs Cnn14 (AudioSet 527) -> conservative label->family map -> per-family prob vector. ffmpeg @32k, zero-pad short one-shots (Cnn14 needs >=1s of mel frames or conv5 collapses). Weak on electronic one-shots (AudioSet 'Clapping'=applause, not a drum-machine clap). - sample_classify.py: --method clap|panns|ensemble, --fine|--coarse. clap_vector() exposes the family-prob vector; ensemble = mean of CLAP+PANNs vectors -> argmax. Scoreboard (vs name-heuristic, itself noisy): clap-coarse 58% | clap-fine 68% | panns - | ensemble - (head-to-head primed, not yet run). Stubborn residual = bass<->kick one-shot (spectral decay tiebreaker is the next lever).
bb70f394 · PLN (Algolia) · 3e14a623 · bb70f394 · bb70f394 · bb70f394
Commit bb70f394 authored Jun 06, 2026 by PLN (Algolia)
Showing with 243 additions and 8 deletions

sample_classify.py armada/tide-table/sample_classify.py +56 -8

sample_ontology.py armada/tide-table/sample_ontology.py +87 -0

sample_panns.py armada/tide-table/sample_panns.py +100 -0

No files found.
--- a/armada/tide-table/sample_classify.py
+++ b/armada/tide-table/sample_classify.py
@@ -29,6 +29,7 @@ from pathlib import Path
 import numpy as np

 import models as M
+import sample_ontology as ONT

 HERE = Path(__file__).resolve().parent
 DIRT = Path.home() / ".local/share/SuperCollider/downloaded-quarks/Dirt-Samples"
@@ -58,6 +59,19 @@ PROMPTS = {
 }
 FAMS = list(PROMPTS)

+# MODE: "coarse" = the 12 family prompts above; "fine" = the rich descriptor ontology
+# (sample_ontology.py), scored per-descriptor then marginalized to family. CLI: --coarse | --fine.
+MODE = "fine"
+# METHOD: "clap" (CLAP only) · "panns" (AudioSet only) · "ensemble" (mean of both). CLI: --method NAME.
+METHOD = "clap"
+
+
+def active_prompts():
+    """Return (prompt_texts, family_per_prompt) for the current MODE.
+
+    The two sequences are index-aligned: entry i of the second is the family
+    that prompt i belongs to. "fine" uses the sample_ontology lists; any other
+    MODE falls back to the coarse PROMPTS dict (one prompt per family).
+    """
+    if MODE != "fine":
+        # coarse: the dict's keys are the families, its values the prompt texts
+        return list(PROMPTS.values()), list(PROMPTS)
+    return ONT.PROMPTS, ONT.FAMILY_OF
+

 # ── audio I/O (ffmpeg, no python audio deps) ──────────────────────────────────
 def load_audio(path):
@@ -92,16 +106,21 @@ def _clap():
        from transformers import ClapModel, ClapProcessor
        model = ClapModel.from_pretrained(MODEL).eval()
        proc = ClapProcessor.from_pretrained(MODEL)
+        prompts, fams = active_prompts()
        # text tower inputs computed once; reused for every audio (cheap to re-run)
-        ti = proc(text=list(PROMPTS.values()), return_tensors="pt", padding=True)
-        _CLAP.update(torch=torch, model=model, proc=proc,
-                     ids=ti["input_ids"], mask=ti["attention_mask"])
+        ti = proc(text=prompts, return_tensors="pt", padding=True)
+        # index → family, for marginalizing per-descriptor probs up to families
+        fam_keys = list(dict.fromkeys(fams))
+        idx = torch.tensor([fam_keys.index(f) for f in fams])
+        onehot = torch.zeros(len(fams), len(fam_keys)).scatter_(1, idx[:, None], 1.0)
+        _CLAP.update(torch=torch, model=model, proc=proc, ids=ti["input_ids"],
+                     mask=ti["attention_mask"], onehot=onehot, fam_keys=fam_keys)
    return _CLAP


-def classify_file(path):
-    """One sample → (family, confidence) or None. Uses CLAP's logits_per_audio
-    (audio↔text cosine × logit_scale) → softmax over the 12 family prompts."""
+def clap_vector(path):
+    """CLAP family-probability vector {family: prob} for one sample, or None.
+    logits_per_audio → softmax over descriptors → marginalize (sum) to families."""
    a = load_audio(path)
    if a is None:
        return None
@@ -112,8 +131,29 @@ def classify_file(path):
        out = C["model"](input_ids=C["ids"], attention_mask=C["mask"],
                         input_features=ai["input_features"])
        probs = torch.softmax(out.logits_per_audio.squeeze(0), dim=-1)
-        i = int(probs.argmax())
-        return FAMS[i], float(probs[i])
+        fam_probs = (probs @ C["onehot"]).tolist()
+        return dict(zip(C["fam_keys"], fam_probs))
+
+
+def classify_file(path):
+    """One sample → (family, confidence) or None, per the active METHOD.
+
+    clap     → CLAP family vector only (fine/coarse per MODE)
+    panns    → PANNs/AudioSet family vector only
+    ensemble → per-family mean of whichever of the two vectors are available
+               (if only one model yields a vector, that vector is used alone)
+
+    Returns None when no selected model produced a vector. Confidence is the
+    winning family's averaged probability, rounded to 3 decimals.
+    Raises ValueError on an unknown METHOD; previously a mistyped --method
+    value matched neither branch and every sample silently came back None.
+    """
+    if METHOD not in ("clap", "panns", "ensemble"):
+        raise ValueError(
+            f"unknown METHOD {METHOD!r}; expected 'clap', 'panns' or 'ensemble'")
+    vecs = []
+    if METHOD in ("clap", "ensemble"):
+        v = clap_vector(path)
+        if v:
+            vecs.append(v)
+    if METHOD in ("panns", "ensemble"):
+        # imported only on this branch, so CLAP-only runs never import sample_panns
+        import sample_panns
+        v = sample_panns.family_vector(path)
+        if v:
+            vecs.append(v)
+    if not vecs:
+        return None
+    # NOTE(review): assumes the coarse PROMPTS keys are the same family names as
+    # ONT.FAMILIES; a family missing from a vector counts as 0.0 — confirm.
+    keys = ONT.FAMILIES
+    avg = {k: sum(v.get(k, 0.0) for v in vecs) / len(vecs) for k in keys}
+    f = max(avg, key=avg.get)
+    return f, round(avg[f], 3)


 def classify_folder(name):
@@ -220,7 +260,15 @@ def cmd_one(name):


 def main():
+    global MODE, METHOD
    args = sys.argv[1:]
+    if "--coarse" in args:
+        MODE = "coarse"
+    if "--fine" in args:
+        MODE = "fine"
+    if "--method" in args:
+        METHOD = args[args.index("--method") + 1]
+    print(f"   (method={METHOD}, clap-mode={MODE})")
    cmd = args[0] if args else "validate"
    if cmd == "validate":
        cmd_validate()

--- a/armada/tide-table/sample_ontology.py
+++ b/armada/tide-table/sample_ontology.py
+"""Fine sample-descriptor ontology for zero-shot audio classification.
+
+Each entry is a natural-language descriptor a contrastive audio model (CLAP) can
+match an unknown one-shot against. Many sharp anchors separate better than 12 coarse
+family prompts — especially the melodic cluster (keys/lead/synth/pad) and the
+bass↔kick boundary. At classify time we score every descriptor, then MARGINALIZE
+(sum probabilities) over the descriptors of each FAMILY → the 12-way fleet target.
+
+This is the 'vocabulary the classifier listens with'. Extend freely — more, more
+specific terms generally help. The only contract is the pair of index-aligned lists
+`PROMPTS[i]` (prompt text) and `FAMILY_OF[i]` (its family), plus `FAMILIES`.
+A second, finer label per descriptor (`SUBLABEL`) is planned but not defined yet; it is
+meant for future fine-grained output (e.g. '808 kick' vs 'acoustic kick') and for
+PANNs/AudioSet mapping work."""
+
+# family → list of descriptor phrases (plain strings; no sub-labels are stored yet).
+# Phrases are written to drop cleanly into "this is the sound of {phrase}".
+ONTOLOGY = {
+    "kick": [
+        "a kick drum", "an 808 kick drum", "a punchy acoustic bass drum",
+        "a boomy four-on-the-floor kick", "a distorted hardcore kick",
+        "a tight techno kick", "a deep sub kick with a click",
+    ],
+    "snare": [
+        "a snare drum", "an acoustic snare backbeat", "a tight electronic snare",
+        "a 909 snare", "a snappy drum-machine snare", "a rimshot",
+        "a single hand clap", "a layered clap stack", "a side-stick",
+    ],
+    "hat": [
+        "a closed hi-hat tick", "an open hi-hat", "a hi-hat groove",
+        "a ride cymbal", "a crash cymbal", "a metallic cymbal sizzle",
+        "a short bright shaker tick",
+    ],
+    "perc": [
+        "a conga hit", "a bongo drum", "a tom-tom", "a tabla", "a djembe",
+        "a tambourine", "a woodblock", "a cowbell", "a clave", "a guiro scrape",
+        "a tuned hand-percussion one-shot",
+    ],
+    "break": [
+        "a drum break loop", "an amen breakbeat", "a chopped jungle break",
+        "a funky drummer loop", "a full drum loop with kick snare and hi-hats",
+        "a breakbeat rhythm with swing",
+    ],
+    "bass": [
+        "a sustained sub bass note", "a deep low bass tone", "a reese bass",
+        "an acid 303 bassline", "a fingered bass guitar", "a wobble dubstep bass",
+        "a round synth bassline", "a low droning bass", "a plucked bass note",
+    ],
+    "keys": [
+        "a grand piano chord", "an electric piano", "a rhodes chord",
+        "a wurlitzer keyboard", "a hammond organ", "a clean electric guitar chord",
+        "an acoustic guitar pluck", "a harpsichord", "a clavinet", "a marimba",
+    ],
+    "lead": [
+        "a bright synth lead melody", "a fast synth arpeggio", "a plucky lead synth",
+        "a chiptune square-wave lead", "a sawtooth lead line", "a brass stab",
+        "a whistle-like lead melody",
+    ],
+    "synth": [
+        "a polysynth chord stab", "an electronic synthesizer tone", "an FM synth bell",
+        "a detuned supersaw chord", "a retro chiptune synth", "a digital synth pad stab",
+        "a glassy synth texture",
+    ],
+    "pad": [
+        "a warm synth pad", "a slow ambient drone", "an evolving sustained texture",
+        "a string ensemble pad", "a breathy choir pad", "an atmospheric wash",
+        "a cinematic drone bed",
+    ],
+    "fx": [
+        "a riser sweep", "a downlifter", "a white-noise sweep", "an impact boom",
+        "a reverse cymbal swell", "a vinyl crackle texture", "a glitch effect",
+        "a sci-fi whoosh", "a foley sound effect", "a tape stop",
+    ],
+    "vox": [
+        "a sung vocal phrase", "a spoken-word sample", "a chopped vocal", "a rap acapella",
+        "a layered choir", "a vocal scat", "a movie dialogue clip", "a crowd chant",
+        "a breathy vocal ad-lib",
+    ],
+}
+
+# Flatten ONTOLOGY into two index-aligned lists (the classify-time contract):
+# PROMPTS[i] is the full prompt text, FAMILY_OF[i] the family that prompt belongs to.
+# Both follow ONTOLOGY's insertion order: family by family, term by term.
+PROMPTS = ["this is the sound of " + _t
+           for _terms in ONTOLOGY.values() for _t in _terms]
+FAMILY_OF = [_fam for _fam, _terms in ONTOLOGY.items() for _ in _terms]
+
+N_TERMS = len(PROMPTS)        # total number of descriptors across all families
+FAMILIES = list(ONTOLOGY)     # family names, in declaration order
--- a/armada/tide-table/sample_panns.py
+++ b/armada/tide-table/sample_panns.py
+"""PANNs (AudioSet) sample tagging → fleet-family probability vector.
+
+A second, independent opinion to ensemble with CLAP. PANNs Cnn14 is trained on
+AudioSet's 527 sound classes — many are exactly our percussion/instrument families
+("Bass drum", "Snare drum", "Hi-hat", "Bass guitar", "Synthesizer", "Singing"…).
+We sum the model's clip-wise probabilities over the AudioSet labels mapped to each
+fleet family and normalize → a comparable 12-way vector.
+
+Audio is decoded to 32 kHz mono (Cnn14's rate) via ffmpeg — no librosa needed here.
+`family_vector(path)` returns a dict {family: prob} (None if nothing relevant fired).
+The map is conservative: only confident, unambiguous AudioSet labels are wired; generic
+classes ("Drum", "Music") are deliberately left out so they can't pollute the vote."""
+import subprocess
+
+import numpy as np
+
+import sample_ontology as ONT
+
+SR = 32000
+FAMILIES = ONT.FAMILIES
+
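+# AudioSet display-name → fleet family. A key must equal a panns label exactly (whole-string
+# match in _model(), not a substring match); names absent from the panns label list are ignored.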
+AUDIOSET_FAMILY = {
+    "Bass drum": "kick",
+    "Snare drum": "snare", "Rimshot": "snare", "Clapping": "snare",
+    "Hi-hat": "hat", "Cymbal": "hat", "Crash cymbal": "hat",
+    "Tabla": "perc", "Tambourine": "perc", "Maraca": "perc", "Cowbell": "perc",
+    "Wood block": "perc", "Tom-tom": "perc", "Conga": "perc", "Marimba, xylophone": "perc",
+    "Bass guitar": "bass", "Double bass": "bass",
+    "Piano": "keys", "Electric piano": "keys", "Organ": "keys",
+    "Electronic organ": "keys", "Hammond organ": "keys", "Harpsichord": "keys",
+    "Keyboard (musical)": "keys", "Mellotron": "keys", "Guitar": "keys",
+    "Electric guitar": "keys", "Acoustic guitar": "keys", "Clavinet": "keys",
+    "Synthesizer": "synth", "Sampler": "synth", "Sine wave": "synth",
+    "Brass instrument": "lead", "Trumpet": "lead", "Saxophone": "lead", "Trombone": "lead",
+    "String section": "pad", "Orchestra": "pad", "Violin, fiddle": "pad",
+    "Sound effect": "fx", "Effects unit": "fx", "Whoosh, swoosh, swish": "fx",
+    "Noise": "fx", "White noise": "fx", "Wind": "fx",
+    "Scratching (performance technique)": "fx",
+    "Singing": "vox", "Choir": "vox", "Male singing": "vox", "Female singing": "vox",
+    "Rapping": "vox", "Speech": "vox", "Vocal music": "vox", "Chant": "vox",
+    "Mantra": "vox", "Humming": "vox", "Yodeling": "vox",
+}
+
+_M = {}
+
+
+def _model():
+    """Lazily build and cache the PANNs tagger and its label-index → family table.
+
+    The first call imports panns_inference, constructs AudioTagging on CPU and
+    records, for every label whose name is a key of AUDIOSET_FAMILY, that
+    label's index → family. Later calls return the same cached dict, whose
+    keys are "at" (the tagger) and "idx2fam" (the table).
+    """
+    if _M:
+        return _M
+    from panns_inference import AudioTagging
+    from panns_inference.config import labels
+    tagger = AudioTagging(checkpoint_path=None, device="cpu")
+    # several labels may map to one family; family_vector() sums them per family
+    table = {i: AUDIOSET_FAMILY[name] for i, name in enumerate(labels)
+             if name in AUDIOSET_FAMILY}
+    _M.update(at=tagger, idx2fam=table)
+    return _M
+
+
+def load_audio(path):
+    """Decode `path` with ffmpeg to a mono float32 array at SR Hz, or None.
+
+    At most the first 10 seconds are decoded (-t 10) and ffmpeg is given 30 s
+    to finish. Returns None when fewer than SR // 100 samples come back or
+    when anything in the decode raises (best-effort: failures are swallowed).
+    Clips shorter than SR samples are zero-padded at the end up to SR samples.
+    """
+    try:
+        cmd = ["ffmpeg", "-v", "quiet", "-i", str(path), "-ac", "1", "-ar", str(SR),
+               "-t", "10", "-f", "f32le", "-"]
+        raw = subprocess.run(cmd, capture_output=True, timeout=30).stdout
+        samples = np.frombuffer(raw, dtype=np.float32).copy()   # writable for torch
+        count = samples.size
+        if count < SR // 100:
+            return None
+        if count >= SR:
+            return samples
+        # Cnn14 needs ≥~1s of mel frames; zero-pad short one-shots
+        return np.pad(samples, (0, SR - count))
+    except Exception:
+        return None
+
+
+def family_vector(path):
+    """Return {family: prob} built from the mapped AudioSet tags, or None.
+
+    Each mapped label's clip-wise score is added to its family's total, then
+    the totals are divided by their sum so the vector adds up to 1. Families
+    with no mapped label stay at 0.0. None is returned when the audio cannot
+    be loaded or when the summed score over mapped labels is ≤ 1e-6.
+    """
+    samples = load_audio(path)
+    if samples is None:
+        return None
+    state = _model()
+    scores, _ = state["at"].inference(samples[None, :])     # (1, 527)
+    row = scores[0]
+    totals = dict.fromkeys(FAMILIES, 0.0)
+    for label_idx, family in state["idx2fam"].items():
+        totals[family] += float(row[label_idx])
+    mass = sum(totals.values())
+    if mass <= 1e-6:
+        return None
+    return {family: score / mass for family, score in totals.items()}
+
+
+def classify_file(path):
+    """Return (family, confidence) for the dominant mapped AudioSet family, or None.
+
+    Confidence is that family's share of family_vector(path), rounded to 3
+    decimals. None is returned whenever family_vector() yields nothing.
+    """
+    vec = family_vector(path)
+    if not vec:
+        return None
+    # max() keeps the first maximal entry, so ties resolve in FAMILIES order
+    best, prob = max(vec.items(), key=lambda kv: kv[1])
+    return best, round(prob, 3)