Commit bb70f394 by PLN (Algolia)

feat(sample-classify): fine ontology + PANNs + ensemble methods

Per PLN: richer ontology + PANNs/AudioSet + ensembles for sample grounding.
- sample_ontology.py: 99 fine descriptors across the 12 families ('this is the
  sound of {a reese bass}'); scored per-descriptor then marginalized to family.
  CLAP fine: 58% -> 68% top-1 (coarse super-family 76%) vs the noisy name truth.
- sample_panns.py: PANNs Cnn14 (AudioSet 527) -> conservative label->family map ->
  per-family prob vector. ffmpeg @32k, zero-pad short one-shots (Cnn14 needs >=1s
  of mel frames or conv5 collapses). Weak on electronic one-shots (AudioSet
  'Clapping'=applause, not a drum-machine clap).
- sample_classify.py: --method clap|panns|ensemble, --fine|--coarse. clap_vector()
  exposes the family-prob vector; ensemble = mean of CLAP+PANNs vectors -> argmax.

Scoreboard (vs name-heuristic, itself noisy): clap-coarse 58% | clap-fine 68% |
panns - | ensemble - (head-to-head primed, not yet run). Stubborn residual =
bass<->kick one-shot (spectral decay tiebreaker is the next lever).
parent 3e14a623
...@@ -29,6 +29,7 @@ from pathlib import Path ...@@ -29,6 +29,7 @@ from pathlib import Path
import numpy as np import numpy as np
import models as M import models as M
import sample_ontology as ONT
HERE = Path(__file__).resolve().parent HERE = Path(__file__).resolve().parent
DIRT = Path.home() / ".local/share/SuperCollider/downloaded-quarks/Dirt-Samples" DIRT = Path.home() / ".local/share/SuperCollider/downloaded-quarks/Dirt-Samples"
...@@ -58,6 +59,19 @@ PROMPTS = { ...@@ -58,6 +59,19 @@ PROMPTS = {
} }
FAMS = list(PROMPTS) FAMS = list(PROMPTS)
# MODE: "coarse" = the 12 family prompts above; "fine" = the rich descriptor ontology
# (sample_ontology.py), scored per-descriptor then marginalized to family. Set by CLI.
MODE = "fine"
# METHOD: "clap" (CLAP only) · "panns" (AudioSet only) · "ensemble" (mean of both).
METHOD = "clap"
def active_prompts():
    """Return ``(prompt_texts, family_per_prompt)`` for the current MODE.

    When MODE is "fine" the two parallel lists come from the descriptor
    ontology module; for any other MODE the coarse family prompts defined in
    this file are used, one prompt per family.
    """
    if MODE != "fine":
        # Coarse: dict values are the prompt texts, dict keys the families.
        return list(PROMPTS.values()), list(PROMPTS)
    return ONT.PROMPTS, ONT.FAMILY_OF
# ── audio I/O (ffmpeg, no python audio deps) ────────────────────────────────── # ── audio I/O (ffmpeg, no python audio deps) ──────────────────────────────────
def load_audio(path): def load_audio(path):
...@@ -92,16 +106,21 @@ def _clap(): ...@@ -92,16 +106,21 @@ def _clap():
from transformers import ClapModel, ClapProcessor from transformers import ClapModel, ClapProcessor
model = ClapModel.from_pretrained(MODEL).eval() model = ClapModel.from_pretrained(MODEL).eval()
proc = ClapProcessor.from_pretrained(MODEL) proc = ClapProcessor.from_pretrained(MODEL)
prompts, fams = active_prompts()
# text tower inputs computed once; reused for every audio (cheap to re-run) # text tower inputs computed once; reused for every audio (cheap to re-run)
ti = proc(text=list(PROMPTS.values()), return_tensors="pt", padding=True) ti = proc(text=prompts, return_tensors="pt", padding=True)
_CLAP.update(torch=torch, model=model, proc=proc, # index → family, for marginalizing per-descriptor probs up to families
ids=ti["input_ids"], mask=ti["attention_mask"]) fam_keys = list(dict.fromkeys(fams))
idx = torch.tensor([fam_keys.index(f) for f in fams])
onehot = torch.zeros(len(fams), len(fam_keys)).scatter_(1, idx[:, None], 1.0)
_CLAP.update(torch=torch, model=model, proc=proc, ids=ti["input_ids"],
mask=ti["attention_mask"], onehot=onehot, fam_keys=fam_keys)
return _CLAP return _CLAP
def classify_file(path): def clap_vector(path):
"""One sample → (family, confidence) or None. Uses CLAP's logits_per_audio """CLAP family-probability vector {family: prob} for one sample, or None.
(audio↔text cosine × logit_scale) → softmax over the 12 family prompts.""" logits_per_audio → softmax over descriptors → marginalize (sum) to families."""
a = load_audio(path) a = load_audio(path)
if a is None: if a is None:
return None return None
...@@ -112,8 +131,29 @@ def classify_file(path): ...@@ -112,8 +131,29 @@ def classify_file(path):
out = C["model"](input_ids=C["ids"], attention_mask=C["mask"], out = C["model"](input_ids=C["ids"], attention_mask=C["mask"],
input_features=ai["input_features"]) input_features=ai["input_features"])
probs = torch.softmax(out.logits_per_audio.squeeze(0), dim=-1) probs = torch.softmax(out.logits_per_audio.squeeze(0), dim=-1)
i = int(probs.argmax()) fam_probs = (probs @ C["onehot"]).tolist()
return FAMS[i], float(probs[i]) return dict(zip(C["fam_keys"], fam_probs))
def classify_file(path):
    """Classify one sample according to the active METHOD.

    Collects one family-probability dict per enabled model ("clap" and/or
    "panns"; "ensemble" enables both), averages them family by family and
    returns ``(family, confidence)`` for the highest mean, with the
    confidence rounded to 3 decimals.

    Returns None when no enabled model produced a vector; this includes the
    case where METHOD is none of "clap", "panns" or "ensemble".
    """
    votes = []
    if METHOD in ("clap", "ensemble"):
        clap_probs = clap_vector(path)
        if clap_probs:
            votes.append(clap_probs)
    if METHOD in ("panns", "ensemble"):
        # Imported lazily so the PANNs module is only loaded when requested.
        import sample_panns
        panns_probs = sample_panns.family_vector(path)
        if panns_probs:
            votes.append(panns_probs)
    if not votes:
        return None

    n_votes = len(votes)
    mean = {}
    for family in ONT.FAMILIES:
        # A family missing from a model's vector contributes 0.0 to the mean.
        mean[family] = sum(vote.get(family, 0.0) for vote in votes) / n_votes
    best, score = max(mean.items(), key=lambda item: item[1])
    return best, round(score, 3)
def classify_folder(name): def classify_folder(name):
...@@ -220,7 +260,15 @@ def cmd_one(name): ...@@ -220,7 +260,15 @@ def cmd_one(name):
def main(): def main():
global MODE, METHOD
args = sys.argv[1:] args = sys.argv[1:]
if "--coarse" in args:
MODE = "coarse"
if "--fine" in args:
MODE = "fine"
if "--method" in args:
METHOD = args[args.index("--method") + 1]
print(f" (method={METHOD}, clap-mode={MODE})")
cmd = args[0] if args else "validate" cmd = args[0] if args else "validate"
if cmd == "validate": if cmd == "validate":
cmd_validate() cmd_validate()
......
"""Fine sample-descriptor ontology for zero-shot audio classification.
Each entry is a natural-language descriptor a contrastive audio model (CLAP) can
match an unknown one-shot against. Many sharp anchors separate better than 12 coarse
family prompts — especially the melodic cluster (keys/lead/synth/pad) and the
bass↔kick boundary. At classify time we score every descriptor, then MARGINALIZE
(sum probabilities) over the descriptors of each FAMILY → the 12-way fleet target.
This is the 'vocabulary the classifier listens with'. Extend freely — more, more
specific terms generally help. `FAMILY_OF[i]` is the family of `PROMPTS[i]` (parallel lists); that pairing is the only contract.
A second, finer label per descriptor (`SUBLABEL`) is kept for future fine-grained
output (e.g. '808 kick' vs 'acoustic kick') and for PANNs/AudioSet mapping work."""
# family → list of descriptors. An entry is either a bare phrase or a
# (phrase, sub-label) pair; the sub-label defaults to the phrase itself.
# Phrases are written to drop cleanly into "this is the sound of {phrase}".
ONTOLOGY = {
    "kick": [
        "a kick drum", "an 808 kick drum", "a punchy acoustic bass drum",
        "a boomy four-on-the-floor kick", "a distorted hardcore kick",
        "a tight techno kick", "a deep sub kick with a click",
    ],
    "snare": [
        "a snare drum", "an acoustic snare backbeat", "a tight electronic snare",
        "a 909 snare", "a snappy drum-machine snare", "a rimshot",
        "a single hand clap", "a layered clap stack", "a side-stick",
    ],
    "hat": [
        "a closed hi-hat tick", "an open hi-hat", "a hi-hat groove",
        "a ride cymbal", "a crash cymbal", "a metallic cymbal sizzle",
        "a short bright shaker tick",
    ],
    "perc": [
        "a conga hit", "a bongo drum", "a tom-tom", "a tabla", "a djembe",
        "a tambourine", "a woodblock", "a cowbell", "a clave", "a guiro scrape",
        "a tuned hand-percussion one-shot",
    ],
    "break": [
        "a drum break loop", "an amen breakbeat", "a chopped jungle break",
        "a funky drummer loop", "a full drum loop with kick snare and hi-hats",
        "a breakbeat rhythm with swing",
    ],
    "bass": [
        "a sustained sub bass note", "a deep low bass tone", "a reese bass",
        "an acid 303 bassline", "a fingered bass guitar", "a wobble dubstep bass",
        "a round synth bassline", "a low droning bass", "a plucked bass note",
    ],
    "keys": [
        "a grand piano chord", "an electric piano", "a rhodes chord",
        "a wurlitzer keyboard", "a hammond organ", "a clean electric guitar chord",
        "an acoustic guitar pluck", "a harpsichord", "a clavinet", "a marimba",
    ],
    "lead": [
        "a bright synth lead melody", "a fast synth arpeggio", "a plucky lead synth",
        "a chiptune square-wave lead", "a sawtooth lead line", "a brass stab",
        "a whistle-like lead melody",
    ],
    "synth": [
        "a polysynth chord stab", "an electronic synthesizer tone", "an FM synth bell",
        "a detuned supersaw chord", "a retro chiptune synth", "a digital synth pad stab",
        "a glassy synth texture",
    ],
    "pad": [
        "a warm synth pad", "a slow ambient drone", "an evolving sustained texture",
        "a string ensemble pad", "a breathy choir pad", "an atmospheric wash",
        "a cinematic drone bed",
    ],
    "fx": [
        "a riser sweep", "a downlifter", "a white-noise sweep", "an impact boom",
        "a reverse cymbal swell", "a vinyl crackle texture", "a glitch effect",
        "a sci-fi whoosh", "a foley sound effect", "a tape stop",
    ],
    "vox": [
        "a sung vocal phrase", "a spoken-word sample", "a chopped vocal", "a rap acapella",
        "a layered choir", "a vocal scat", "a movie dialogue clip", "a crowd chant",
        "a breathy vocal ad-lib",
    ],
}

# Flattened parallel arrays (the classify-time contract): index i of PROMPTS,
# FAMILY_OF and SUBLABEL all describe the same descriptor.
PROMPTS, FAMILY_OF, SUBLABEL = [], [], []
for _fam, _terms in ONTOLOGY.items():
    for _t in _terms:
        # Accept both entry shapes described above; a bare phrase is its own
        # sub-label, so a (phrase, sub-label) pair no longer breaks the
        # string concatenation below.
        _phrase, _sub = (_t, _t) if isinstance(_t, str) else _t
        PROMPTS.append("this is the sound of " + _phrase)
        FAMILY_OF.append(_fam)
        SUBLABEL.append(_sub)
N_TERMS = len(PROMPTS)
FAMILIES = list(ONTOLOGY)
"""PANNs (AudioSet) sample tagging → fleet-family probability vector.
A second, independent opinion to ensemble with CLAP. PANNs Cnn14 is trained on
AudioSet's 527 sound classes — many are exactly our percussion/instrument families
("Bass drum", "Snare drum", "Hi-hat", "Bass guitar", "Synthesizer", "Singing"…).
We sum the model's clip-wise probabilities over the AudioSet labels mapped to each
fleet family and normalize → a comparable 12-way vector.
Audio is decoded to 32 kHz mono (Cnn14's rate) via ffmpeg — no librosa needed here.
`family_vector(path)` returns a dict {family: prob} (None if nothing relevant fired).
The map is conservative: only confident, unambiguous AudioSet labels are wired; generic
classes ("Drum", "Music") are deliberately left out so they can't pollute the vote."""
import subprocess
import numpy as np
import sample_ontology as ONT
# Sample rate (Hz) that audio is decoded to before tagging.
SR = 32000
# Fleet families, taken from the ontology module so both classifiers agree.
FAMILIES = ONT.FAMILIES
# AudioSet display-name → fleet family. _model() looks each panns label up in
# this dict by exact key equality (not substring), so a key that does not equal
# a panns label exactly never contributes to any family.
AUDIOSET_FAMILY = {
    "Bass drum": "kick",
    "Snare drum": "snare", "Rimshot": "snare", "Clapping": "snare",
    "Hi-hat": "hat", "Cymbal": "hat", "Crash cymbal": "hat",
    "Tabla": "perc", "Tambourine": "perc", "Maraca": "perc", "Cowbell": "perc",
    # NOTE(review): sample_ontology lists "a marimba" under "keys" while this
    # map sends "Marimba, xylophone" to "perc" — confirm which is intended.
    "Wood block": "perc", "Tom-tom": "perc", "Conga": "perc", "Marimba, xylophone": "perc",
    "Bass guitar": "bass", "Double bass": "bass",
    "Piano": "keys", "Electric piano": "keys", "Organ": "keys",
    "Electronic organ": "keys", "Hammond organ": "keys", "Harpsichord": "keys",
    "Keyboard (musical)": "keys", "Mellotron": "keys", "Guitar": "keys",
    "Electric guitar": "keys", "Acoustic guitar": "keys", "Clavinet": "keys",
    "Synthesizer": "synth", "Sampler": "synth", "Sine wave": "synth",
    "Brass instrument": "lead", "Trumpet": "lead", "Saxophone": "lead", "Trombone": "lead",
    "String section": "pad", "Orchestra": "pad", "Violin, fiddle": "pad",
    "Sound effect": "fx", "Effects unit": "fx", "Whoosh, swoosh, swish": "fx",
    "Noise": "fx", "White noise": "fx", "Wind": "fx",
    "Scratching (performance technique)": "fx",
    "Singing": "vox", "Choir": "vox", "Male singing": "vox", "Female singing": "vox",
    "Rapping": "vox", "Speech": "vox", "Vocal music": "vox", "Chant": "vox",
    "Mantra": "vox", "Humming": "vox", "Yodeling": "vox",
}
# Lazily filled cache: "at" → the tagger, "idx2fam" → {label index: family}.
_M = {}


def _model():
    """Return the cached tagger state, building it on first use.

    The first call imports panns_inference, constructs an ``AudioTagging``
    instance on CPU and records, for every panns label that is a key of
    AUDIOSET_FAMILY, the family its index maps to. Later calls return the
    same dict.
    """
    if _M:
        return _M
    from panns_inference import AudioTagging
    from panns_inference.config import labels

    tagger = AudioTagging(checkpoint_path=None, device="cpu")
    # Several label indices may share a family; family_vector sums over them.
    index_to_family = {
        position: AUDIOSET_FAMILY[label]
        for position, label in enumerate(labels)
        if label in AUDIOSET_FAMILY
    }
    _M.update(at=tagger, idx2fam=index_to_family)
    return _M
def load_audio(path):
    """Decode ``path`` with ffmpeg into a mono float32 array at SR Hz.

    At most the first 10 seconds are decoded (``-t 10``) and ffmpeg is given
    30 seconds to finish. Clips shorter than SR samples are zero-padded at
    the end up to SR samples.

    Returns None when fewer than ``SR // 100`` samples were decoded, or when
    anything in the decode path raises (best-effort: a bad file must not
    abort a batch).
    """
    try:
        decoded = subprocess.run(
            [
                "ffmpeg", "-v", "quiet", "-i", str(path),
                "-ac", "1", "-ar", str(SR),
                "-t", "10", "-f", "f32le", "-",
            ],
            capture_output=True,
            timeout=30,
        )
        # frombuffer yields a read-only view; copy so the array is writable.
        samples = np.frombuffer(decoded.stdout, dtype=np.float32).copy()
        count = samples.size
        if count >= SR:
            return samples
        if count >= SR // 100:
            # Short one-shot: pad with trailing zeros up to SR samples.
            return np.pad(samples, (0, SR - count))
        return None
    except Exception:
        return None
def family_vector(path):
    """Map one sample's AudioSet tag scores onto the fleet families.

    Returns ``{family: prob}`` covering every entry of FAMILIES, normalised to
    sum to 1, or None when the audio could not be loaded or the summed score
    of all mapped labels is at most 1e-6.
    """
    audio = load_audio(path)
    if audio is None:
        return None

    state = _model()
    # Add a leading batch axis for inference; keep the first (only) output row.
    clipwise, _ = state["at"].inference(audio[None, :])
    scores = clipwise[0]

    totals = dict.fromkeys(FAMILIES, 0.0)
    for label_index, family in state["idx2fam"].items():
        totals[family] += float(scores[label_index])

    mass = sum(totals.values())
    if mass <= 1e-6:
        return None
    return {family: score / mass for family, score in totals.items()}
def classify_file(path):
    """Return ``(family, confidence)`` for the dominant mapped family.

    The confidence is that family's share of the vector from family_vector(),
    rounded to 3 decimals. Returns None when family_vector() yields nothing.
    """
    probs = family_vector(path)
    if not probs:
        return None
    top_family, top_prob = max(probs.items(), key=lambda item: item[1])
    return top_family, round(top_prob, 3)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment