Commit 3ce8bba8 by PLN (Algolia)

feat(sample-resolve): layered family resolver → grounded sample_families.json

L1 filename (sample_meta) → L3 audio fallback (sample_classify ensemble-fine, the
73% scoreboard winner) → L2 folder cross-check (flags folder_agrees=False when the
folder name lies). Audio runs ONLY on opaque files. Per folder: dominant, agreement
conf, homogeneous, kit_like, per-:index labels with source, provenance. Validates
the domain knowledge mechanically: jazz=kit (000_BD→kick :0, HH/OH→hat, SN→snare),
cpluck=multisample (plucks+body, folder='keys' flagged), 808hc=perc via audio.
parent bdfdf5fc
#!/usr/bin/env python3
"""Layered sample-family resolver → sample_families.json (the grounded palette).
Combines three signals per file, highest-precision first, each tagged with where it
came from (provenance), spending audio compute ONLY where the free signals are silent:
L1 filename (sample_meta) — an instrument token in the filename → family. Gold
where descriptive ('1 KICK LOW1'); silent on opaque names ('JUPI').
L3 audio (sample_classify, ensemble-fine by default) — the fallback for opaque
files, and the only per-:index resolver for source-named kits.
L2 folder (models.classify_sample_family) — a folder-level cross-check: if it
fires and DISAGREES with the resolved dominant, we flag it (the folder
name lied — cpluck/808hc — or the kit is heterogeneous).
A folder name is NEVER trusted to override per-file evidence — that's the whole lesson
(the jazz-is-a-kit correction, [[feedback_mastering_eda]]). Per folder we emit the
dominant family, agreement conf, homogeneity, kit_like, the per-:index labels with
their source, and provenance. Audio runs only on opaque files → cheap.
python3 sample_resolve.py one <folder>
python3 sample_resolve.py run [--all] [--limit N] [--method clap|ensemble] [--coarse]
"""
import json
import sys
from collections import Counter
from datetime import date
from pathlib import Path
import models as M
import sample_classify as CLF
import sample_meta as META
# Directory that contains this script; the files below are located relative to it.
HERE = Path(__file__).resolve().parent
# Path of the sample_families.json file, next to this script.
OUT = HERE / "sample_families.json"
def resolve_file(path, do_audio=True):
    """Resolve one sample file to a ``(family, source)`` pair.

    The filename is consulted first (L1); the audio classifier (L3) is only
    asked when the filename yields no family and ``do_audio`` is true.

    Returns:
        ``(family, "filename")`` when the parsed name carries a family,
        ``(family, "audio")`` when the classifier returns a non-empty result
        (its first element is used as the family), otherwise ``(None, None)``.
    """
    parsed = META.parse_name(path.stem)
    name_family = parsed["family"]
    if name_family:
        return name_family, "filename"

    if not do_audio:
        return None, None

    guess = CLF.classify_file(path)  # honours CLF.METHOD / CLF.MODE
    if not guess:
        return None, None
    return guess[0], "audio"
def resolve_folder(name, do_audio=True):
    """Resolve every file of one folder and summarise the outcome.

    At most ``CLF.MAX_FILES`` files are considered. Each file is resolved with
    ``resolve_file`` and recorded under its position in the folder listing.

    Returns:
        ``None`` when the folder has no files or no file could be resolved;
        otherwise a dict with the counts, the dominant family, the fraction of
        resolved files agreeing with it, the homogeneity / kit flags, where the
        labels came from, the folder-name cross-check, and the per-index labels.
    """
    candidates = META.folder_files(name)
    if not candidates:
        return None
    candidates = candidates[:CLF.MAX_FILES]

    labelled = []
    for position, sample in enumerate(candidates):
        family, origin = resolve_file(sample, do_audio=do_audio)
        labelled.append(
            {"index": position, "name": sample.stem, "family": family, "source": origin}
        )

    # Only files that received a family take part in the tallies; counting in
    # file order keeps tie-breaking between equally common families unchanged.
    resolved = [row for row in labelled if row["family"]]
    families = Counter(row["family"] for row in resolved)
    origins = Counter(row["source"] for row in resolved)

    total = sum(families.values())
    if not total:
        return None

    dominant, dominant_count = families.most_common(1)[0]
    agreement = dominant_count / total
    folder_family = M.classify_sample_family(name)  # L2 cross-check

    if not origins.get("audio"):
        label_source = "filename"
    elif not origins.get("filename"):
        label_source = "audio"
    else:
        label_source = "mixed"

    return {
        "n": len(candidates),
        "n_resolved": total,
        "by_family": dict(families.most_common()),
        "dominant": dominant,
        "conf": round(agreement, 3),  # fraction agreeing
        "homogeneous": agreement >= 0.6,  # else a kit / mixed
        "kit_like": len(families) >= 2,
        "label_source": label_source,  # filename | audio | mixed
        "folder_name_family": folder_family,  # L2 (or None)
        "folder_agrees": folder_family is None or folder_family == dominant,
        "per_index": labelled,
    }
def _corpus_sounds():
    """Return the sorted, de-duplicated sound names used by the catalog.

    Reads ``catalog_view.json`` next to this script and collects every entry
    of each track's ``score_sounds`` list (tracks without that key contribute
    nothing).
    """
    # Context manager closes the handle deterministically; JSON is read as
    # UTF-8 explicitly instead of relying on the platform's locale encoding.
    with open(HERE / "catalog_view.json", encoding="utf-8") as fh:
        cv = json.load(fh)
    return sorted({s for t in cv["tracks"] for s in t.get("score_sounds", [])})
def cmd_run(all_folders=False, limit=None):
    """Resolve a batch of folders, print progress, and write the result to ``OUT``.

    Args:
        all_folders: when true, every directory under ``META.DIRT`` is
            resolved; otherwise only the sounds from ``_corpus_sounds()`` for
            which ``META.folder_files`` returns files.
        limit: optional cap on how many folder names are processed; a falsy
            value (``None`` or ``0``) means no cap.

    Folders for which ``resolve_folder`` returns nothing are skipped and do not
    appear in the output file.
    """
    names = (sorted(d.name for d in META.DIRT.iterdir() if d.is_dir())
             if all_folders else [s for s in _corpus_sounds() if META.folder_files(s)])
    if limit:
        names = names[:limit]
    method = f"{CLF.METHOD}:{CLF.MODE}"
    print(f"⛵ resolving {len(names)} folders — L1 filename → L3 audio ({method})\n")
    out, flags = {}, 0
    for i, name in enumerate(names, 1):
        r = resolve_folder(name)
        if not r:
            continue
        out[name] = r
        if not r["folder_agrees"]:
            flags += 1
        tag = "" if r["folder_agrees"] else f" ⚠ folder='{r['folder_name_family']}'≠{r['dominant']}"
        kit = " kit" if r["kit_like"] and not r["homogeneous"] else ""
        print(f" [{i}/{len(names)}] {name:<24} {r['dominant']:<6} conf={r['conf']} "
              f"src={r['label_source']}{kit}{tag}", flush=True)
    prov = M.Provenance(source=M.Source.derived,
                        locator=f"sample_resolve: L1 filename + L3 {method}",
                        as_of=date.today()).model_dump(mode="json")
    homo = sum(1 for v in out.values() if v["homogeneous"])
    kits = sum(1 for v in out.values() if v["kit_like"] and not v["homogeneous"])
    payload = {"schema": "grounded sample families (L1 filename + L3 audio)",
               "method": method, "n_folders": len(out), "provenance": prov,
               "families": out}
    # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
    # encoding is pinned to UTF-8 rather than left to the platform locale.
    OUT.write_text(json.dumps(payload, ensure_ascii=False, indent=1), encoding="utf-8")
    print(f"\n✓ {OUT.name}: {len(out)} folders — {homo} homogeneous, {kits} kit-like, "
          f"{flags} folder-name disagreements flagged")
def cmd_one(name):
    """Resolve a single folder and print its summary and per-index labels.

    Prints a one-line notice instead when ``resolve_folder`` returns nothing.
    Only the first 24 per-index entries are listed.
    """
    report = resolve_folder(name)
    if not report:
        print(f"no folder/files for {name!r}")
        return

    headline = (
        f"⛵ {name} — dominant={report['dominant']} conf={report['conf']} "
        f"src={report['label_source']} "
        f"kit_like={report['kit_like']} folder_agrees={report['folder_agrees']} "
        f"(L2='{report['folder_name_family']}')\n"
    )
    print(headline)

    for entry in report["per_index"][:24]:
        family = entry["family"] or "·"  # placeholder for unresolved files
        source = entry["source"] or ""
        print(f" :{entry['index']:<3} {family:<7} {source:<9} {entry['name']}")
_USAGE = ("usage: sample_resolve.py [one <folder> | run [--all] [--limit N] "
          "[--method clap|ensemble] [--coarse]]")


def _flag_value(args, flag):
    """Return the token following *flag* in *args*, or None when *flag* is absent.

    Exits with the usage message when *flag* is the last token, i.e. its value
    is missing.
    """
    if flag not in args:
        return None
    value_pos = args.index(flag) + 1
    if value_pos >= len(args):
        sys.exit(f"{flag} expects a value\n{_USAGE}")
    return args[value_pos]


def main():
    """Command-line entry point: ``one <folder>`` or ``run [options]``.

    The command line is validated in full before any classifier setting is
    changed, so a malformed invocation ends with the usage message rather
    than an IndexError/ValueError traceback.

    Options:
        --coarse        set ``CLF.MODE`` to ``"coarse"``.
        --method NAME   set ``CLF.METHOD`` to NAME (default ``"ensemble"``).
        --all           (run) resolve every folder instead of the corpus sounds.
        --limit N       (run) process at most N folders.
    """
    args = sys.argv[1:]
    command = args[0] if args else None
    is_one = command == "one" and len(args) > 1
    is_run = command == "run"
    if not (is_one or is_run):
        sys.exit(_USAGE)

    method = _flag_value(args, "--method")
    limit = None
    if is_run:
        # --limit is only meaningful for `run`; it is ignored for `one`.
        raw_limit = _flag_value(args, "--limit")
        if raw_limit is not None:
            try:
                limit = int(raw_limit)
            except ValueError:
                sys.exit(f"--limit expects an integer, got {raw_limit!r}\n{_USAGE}")

    if "--coarse" in args:
        CLF.MODE = "coarse"
    # best on the scoreboard (73%) when no method is requested
    CLF.METHOD = method if method is not None else "ensemble"
    print(f" (audio method={CLF.METHOD}, mode={CLF.MODE})")

    if is_one:
        cmd_one(args[1])
    else:
        cmd_run(all_folders="--all" in args, limit=limit)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment