Commit 3e14a623 by PLN (Algolia)

feat(sample-classify): CLAP zero-shot sample-family analyzer (katana)

Ground sample families by LISTENING, not by name. sample_classify.py runs
laion/clap-htsat-unfused (transformers, torch CPU) over Dirt-Samples one-shots,
scoring each against text prompts for the 12 fleet families; aggregates per folder
(dominant + homogeneity → kits show as mixed). ffmpeg audio I/O, no librosa.
validate/run/one commands; validate measures top-1 vs the name-confident folders.

Finding (validate): 58% top-1 agreement with the name-heuristic at fine 12-way.
KEY: the name 'ground truth' itself is wrong in many disagreements — CLAP correctly
calls 808hc/808mc congas (perc), which the name-classifier mislabeled bass via '808'.
CLAP is near-perfect on vox/break/clear-bass/kick/keys; the genuinely fuzzy zone is
the melodic cluster (synth/lead/keys/pad). Prompt-tuning is whack-a-mole on noisy
truth. Conclusion: trust CLAP at coarse granularity, but do not silently trust its fine 12-way labels.
parent e9bba22c
#!/usr/bin/env python3
"""Ground sample families by ANALYSIS, not by name (the project's core principle).
A sample-folder NAME is an unreliable family signal: it may be one instrument, a
heterogeneous kit (`jazz:0`=kick, `:1`=snare…), or a demucs grab named after its
SOURCE song (`wap`, `take5`, `the_revolution`). So instead of guessing from the
name, we LISTEN: CLAP (`laion/clap-htsat-unfused`) zero-shot scores each one-shot
against text prompts for the 12 fleet families, and we aggregate per folder.
We VALIDATE on name-confident folders first (kick/snare/vec1_snare = free ground
truth) before trusting the unresolved ones — build the katana, then cut
([[feedback_build_katana_first]]).
python3 sample_classify.py validate # accuracy on name-confident folders
python3 sample_classify.py run [--all] [--limit N] # classify → sample_families.json
python3 sample_classify.py one <folder> # debug one folder (per-file scores)
Emits `sample_families.json`: per folder → {n, dist:{fam:count}, dominant, conf,
mean_file_conf, homogeneous, source}. Audio I/O is ffmpeg → 48k mono f32 (no librosa/soundfile dep).
The dominant family + homogeneity feed eda_report (replacing the name-heuristic) and
unblock the viz palette story. Provenance recorded (model + as_of) per
[[feedback_metadata_provenance]]."""
import json
import subprocess
import sys
from collections import Counter
from pathlib import Path
import numpy as np
import models as M
# Directory containing this script; the JSON input/output below live next to it.
HERE = Path(__file__).resolve().parent
# Dirt-Samples checkout under the current user's SuperCollider quarks directory.
DIRT = Path.home() / ".local/share/SuperCollider/downloaded-quarks/Dirt-Samples"
# Input: catalog whose tracks list the sound names in use (read by corpus_sounds).
CV = HERE / "catalog_view.json"
# Output: artifact written by cmd_run.
OUT = HERE / "sample_families.json"
# Model id passed to both ClapModel.from_pretrained and ClapProcessor.from_pretrained.
MODEL = "laion/clap-htsat-unfused"
# Sample rate (Hz) ffmpeg resamples to; the same value is handed to the CLAP processor.
SR = 48000
MAX_FILES = 12 # per folder, for speed: only the first 12 (sorted) files are classified
MAX_SECONDS = 10 # CLAP htsat window; truncate long loops
# Suffixes (compared lower-cased) that count as audio when listing a folder.
AUDIO_EXT = {".wav", ".aif", ".aiff", ".flac", ".ogg", ".mp3"}
# One rich prompt per fleet family (CLAP scores audio↔text similarity). Phrasings
# chosen to separate our 12 families; tune against the validation accuracy report.
# The keys are the family labels that end up in the output.
PROMPTS = {
"kick": "a single kick drum or bass drum, a deep low thump",
"snare": "a single snare drum hit or a hand clap, a backbeat crack",
"hat": "a single hi-hat or cymbal, a crisp bright metallic tick or sizzle",
"perc": "a tuned hand percussion one-shot: a conga, bongo, tom-tom, tabla or shaker",
"break": "a drum break loop or breakbeat, a full looped rhythm with kick snare and hats",
"bass": "a sustained low bass note, a sub bass or bass guitar, deep and low-pitched",
"keys": "a piano, electric piano, rhodes, organ or clean guitar playing a chord",
"lead": "a bright monophonic synth lead melody or arpeggio",
"synth": "a polyphonic synthesizer chord or electronic synth stab",
"pad": "a slow sustained pad, drone or ambient wash, long and evolving",
"fx": "a sound effect: a riser, sweep, downlifter, noise or impact, not musical",
"vox": "a human voice: singing, speech, a vocal phrase or chant",
}
# Family labels in prompt order: index i here matches the i-th prompt text fed to
# the processor, which is how classify_file maps an argmax index back to a label.
FAMS = list(PROMPTS)
# ── audio I/O (ffmpeg, no python audio deps) ──────────────────────────────────
def load_audio(path):
    """Decode an audio file to mono float32 samples at ``SR`` Hz using ffmpeg.

    Only the first ``MAX_SECONDS`` seconds are decoded. This is deliberately
    best-effort: every failure mode yields ``None`` so that one bad file does
    not abort a whole folder.

    Args:
        path: File to decode; passed to ffmpeg as ``str(path)``.

    Returns:
        A 1-D ``numpy.float32`` array, or ``None`` when ffmpeg cannot be
        started, exits non-zero, times out, or produces fewer than 10 ms of
        audio.
    """
    cmd = ["ffmpeg", "-v", "quiet", "-i", str(path), "-ac", "1", "-ar", str(SR),
           "-t", str(MAX_SECONDS), "-f", "f32le", "-"]
    try:
        # stdin is detached: ffmpeg otherwise reads interactive commands from the
        # inherited stdin, which can swallow input meant for the calling process.
        proc = subprocess.run(cmd, stdin=subprocess.DEVNULL,
                              capture_output=True, timeout=30)
    except (OSError, subprocess.SubprocessError):
        # OSError: ffmpeg missing / not executable; SubprocessError: timeout.
        return None
    if proc.returncode != 0:
        return None
    try:
        samples = np.frombuffer(proc.stdout, dtype=np.float32)
    except ValueError:
        # Byte count not a multiple of 4 → truncated/garbled stream.
        return None
    return samples if samples.size >= SR // 100 else None  # need ≥10ms
def folder_files(name):
    """List the audio files of the Dirt-Samples folder called *name*.

    Returns the sorted entries of ``DIRT / name`` whose lower-cased suffix is in
    ``AUDIO_EXT``, or ``None`` when the folder does not exist or contains no
    such entry.
    """
    folder = DIRT / name
    if not folder.is_dir():
        return None
    audio = [entry for entry in folder.iterdir()
             if entry.suffix.lower() in AUDIO_EXT]
    if not audio:
        return None
    audio.sort()
    return audio
# ── CLAP (lazy: only imported when we actually classify) ──────────────────────
_CLAP = {}


def _clap():
    """Return the shared CLAP bundle, loading it on first use.

    torch and transformers are imported here rather than at module level so
    they are only loaded once classification actually runs. The returned dict
    holds ``torch`` (the module), ``model``, ``proc`` (the processor) and the
    tokenised prompt tensors ``ids`` / ``mask``.
    """
    if _CLAP:
        return _CLAP

    import torch
    from transformers import ClapModel, ClapProcessor

    model = ClapModel.from_pretrained(MODEL).eval()
    processor = ClapProcessor.from_pretrained(MODEL)
    # The prompts are tokenised a single time; the resulting tensors are fed to
    # the model again for every audio file.
    prompt_texts = list(PROMPTS.values())
    text_inputs = processor(text=prompt_texts, return_tensors="pt", padding=True)
    _CLAP.update({
        "torch": torch,
        "model": model,
        "proc": processor,
        "ids": text_inputs["input_ids"],
        "mask": text_inputs["attention_mask"],
    })
    return _CLAP
def classify_file(path):
    """Zero-shot classify a single sample file.

    The decoded audio is scored against every family prompt; the model's
    ``logits_per_audio`` are turned into probabilities with a softmax over the
    prompts.

    Returns:
        ``(family, confidence)`` — the best-scoring entry of ``FAMS`` and its
        softmax probability — or ``None`` when the file could not be decoded.
    """
    samples = load_audio(path)
    if samples is None:
        return None

    clap = _clap()
    torch, processor, model = clap["torch"], clap["proc"], clap["model"]
    with torch.no_grad():
        audio_inputs = processor(audio=samples, sampling_rate=SR, return_tensors="pt")
        result = model(
            input_ids=clap["ids"],
            attention_mask=clap["mask"],
            input_features=audio_inputs["input_features"],
        )
        scores = result.logits_per_audio.squeeze(0)
        probs = torch.softmax(scores, dim=-1)
    best = int(probs.argmax())
    return FAMS[best], float(probs[best])
def classify_folder(name):
    """Summarise per-file predictions for one folder.

    At most ``MAX_FILES`` files (in sorted order) are classified; files that
    fail to decode are ignored.

    Returns:
        ``None`` when the folder has no audio or nothing could be classified,
        otherwise a dict with the vote count ``n``, the per-family ``dist``
        (most common first), the ``dominant`` family, ``conf`` (share of files
        voting for it), ``mean_file_conf`` (mean per-file softmax confidence),
        ``homogeneous`` (share ≥ 0.6) and the ``source`` tag.
    """
    files = folder_files(name)
    if not files:
        return None

    results = [res for res in map(classify_file, files[:MAX_FILES]) if res]
    if not results:
        return None

    votes = Counter(family for family, _ in results)
    file_confs = [confidence for _, confidence in results]
    n = len(results)
    dominant, top_votes = votes.most_common(1)[0]
    share = top_votes / n
    return {
        "n": n,
        "dist": dict(votes.most_common()),
        "dominant": dominant,
        "conf": round(share, 3),  # fraction of files agreeing
        "mean_file_conf": round(float(np.mean(file_confs)), 3),
        "homogeneous": share >= 0.6,  # below this the folder is a kit / mixed
        "source": "clap:" + MODEL,
    }
# ── corpus vocabulary (the names we care about) ───────────────────────────────
def corpus_sounds():
    """Return the sorted, de-duplicated sound names used across the catalog.

    Reads the JSON file at ``CV`` and unions every track's ``score_sounds``
    list; tracks without that key contribute nothing.

    Raises:
        OSError: ``CV`` cannot be opened.
        json.JSONDecodeError: ``CV`` is not valid JSON.
        KeyError: the document has no top-level ``"tracks"`` entry.
    """
    # The context manager closes the handle promptly, and explicit UTF-8 keeps
    # non-ASCII names independent of the platform's locale encoding.
    with open(CV, encoding="utf-8") as fh:
        catalog = json.load(fh)
    return sorted({
        sound
        for track in catalog["tracks"]
        for sound in track.get("score_sounds", [])
    })
# ── commands ──────────────────────────────────────────────────────────────────
def cmd_validate():
    """Report how often CLAP's dominant family matches the name-derived family.

    Only corpus sounds for which ``M.classify_sample_family`` returns a family
    and whose Dirt-Samples folder has audio are evaluated. Prints one line per
    folder, then the overall top-1 accuracy, a per-family hit/total breakdown
    and the confusion counts (``name→clap``) for the misses.

    This is the katana test — if CLAP can't recover the obvious folders, its
    verdict on the unresolved ones should not be trusted.
    """
    sounds = corpus_sounds()
    truth = ((s, M.classify_sample_family(s)) for s in sounds)
    gt = [(s, fam) for s, fam in truth if fam and folder_files(s)]
    print(f"⛵ validate — {len(gt)} name-confident folders with audio present\n")

    ok = total = 0
    seen, hits = Counter(), Counter()  # per family: folders evaluated / matched
    confmat = Counter()
    for s, fam in gt:
        r = classify_folder(s)
        if not r:
            continue  # nothing decodable in this folder
        hit = r["dominant"] == fam
        total += 1
        ok += hit
        seen[fam] += 1
        hits[fam] += hit
        if not hit:
            confmat[f"{fam}→{r['dominant']}"] += 1
        print(f" {'✓' if hit else '✗'} {s:<22} name={fam:<6} clap={r['dominant']:<6} "
              f"conf={r['conf']} ({r['n']} files)")

    # The conditional guards the division: the f-string is only built when total > 0.
    print(f"\n top-1 accuracy: {ok}/{total} = {ok/total*100:.0f}%" if total else " no data")
    if seen:
        # Previously tallied but never shown; this is where a weak family shows up.
        print(" per-family:", {fam: f"{hits[fam]}/{n}" for fam, n in seen.items()})
    if confmat:
        print(" confusions:", dict(confmat.most_common()))
def cmd_run(all_folders=False, limit=None):
    """Classify folders with CLAP and write the result to ``OUT`` as JSON.

    Args:
        all_folders: When true, classify every directory under ``DIRT``;
            otherwise only corpus sounds that have a folder with audio.
        limit: If truthy, only the first ``limit`` folder names are processed.

    The payload records provenance (``model`` and a UTC ``as_of`` timestamp),
    the prompts used, and one entry per folder that could be classified.
    """
    from datetime import datetime, timezone  # local: only needed to stamp the artifact

    if all_folders:
        names = sorted(d.name for d in DIRT.iterdir() if d.is_dir())
    else:
        names = [s for s in corpus_sounds() if folder_files(s)]
    if limit:
        names = names[:limit]
    print(f"⛵ classifying {len(names)} folders with CLAP ({MODEL})…")

    out = {}
    for i, name in enumerate(names, 1):
        r = classify_folder(name)
        if not r:
            continue  # no audio / nothing decodable: leave the folder out
        out[name] = r
        tag = "" if r["homogeneous"] else " ⚠ mixed/kit"
        print(f" [{i}/{len(names)}] {name:<24} {r['dominant']:<6} "
              f"conf={r['conf']}{tag}")

    payload = {"schema": "sample family analysis (clap zero-shot)",
               "model": MODEL,
               # Provenance stamp: when this analysis was produced (UTC, ISO 8601).
               "as_of": datetime.now(timezone.utc).isoformat(timespec="seconds"),
               "n_folders": len(out),
               "prompts": PROMPTS, "families": out}
    # ensure_ascii=False can emit non-ASCII folder names, so pin the file encoding.
    OUT.write_text(json.dumps(payload, ensure_ascii=False, indent=1), encoding="utf-8")
    homo = sum(1 for v in out.values() if v["homogeneous"])
    print(f"\n✓ {OUT.name}: {len(out)} folders ({homo} homogeneous, "
          f"{len(out)-homo} mixed/kit)")
def cmd_one(name):
    """Debug one folder: print each file's prediction, then the folder summary.

    Args:
        name: Dirt-Samples folder name.

    Exits with a message when the folder is missing or has no audio files, or
    when none of its files could be decoded.
    """
    files = folder_files(name)
    if not files:
        sys.exit(f"no Dirt-Samples folder for {name!r} (synthdef or missing)")
    print(f"⛵ {name} — {len(files)} files (showing ≤{MAX_FILES}):\n")
    for f in files[:MAX_FILES]:
        r = classify_file(f)
        print(f" {f.name:<28} {r[0]:<6} {r[1]:.2f}" if r else f" {f.name}: (decode fail)")

    summary = classify_folder(name)
    if summary is None:
        # classify_folder returns None when every file failed to decode;
        # indexing it would raise TypeError.
        sys.exit(f"no decodable audio in {name!r}")
    print(f"\n → dominant={summary['dominant']} conf={summary['conf']} "
          f"homogeneous={summary['homogeneous']}")
    print(f" dist={summary['dist']}")
def main():
    """Command-line entry point.

    ``validate`` (the default when no argument is given), ``run [--all]
    [--limit N]`` or ``one <folder>``. Anything else — including a missing,
    non-integer or negative ``--limit`` value — exits with the usage string.
    """
    usage = "usage: sample_classify.py [validate | run [--all] [--limit N] | one <folder>]"
    args = sys.argv[1:]
    cmd = args[0] if args else "validate"
    if cmd == "validate":
        cmd_validate()
    elif cmd == "run":
        limit = None
        if "--limit" in args:
            try:
                limit = int(args[args.index("--limit") + 1])
            except (IndexError, ValueError):
                sys.exit(usage)  # "--limit" was last, or its value is not an integer
            if limit < 0:
                # A negative slice bound would silently drop trailing folders.
                sys.exit(usage)
        cmd_run(all_folders="--all" in args, limit=limit)
    elif cmd == "one" and len(args) > 1:
        cmd_one(args[1])
    else:
        sys.exit(usage)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment