Commit 521127ff by PLN (Algolia)

feat(sample-meta): L1 filename-metadata harvester (free per-file signal)

RIFF chunk dump proved samples carry NO semantic embedded metadata (only encoder
tags) — the Pulsar browser shows FILENAMES. So harvest the filename: leading pad
index + instrument-token lexicon → fleet family + source hint. Conservative: opaque
names (JUPI, doing_it_right, 808hc's 'HC') stay family=None → fall back to audio.
Detects kit-like folders (≥2 families by name), the 'jazz is a kit' case.
Corpus coverage: 49% folders / 31% files named, 36 kit-like folders.
parent acf6a7c1
"""Harvest every FREE label a sample file carries — then hand the rest to audio.
Empirically (a RIFF chunk-table dump over rample/daft/bd WAVs) our samples carry
NO semantic *embedded* metadata: only encoder/software tags (`LIST/INFO` ISFT=Lavf,
an `id3` software frame). The Pulsar sample browser's descriptive labels
(`rampleA0:0 [1 KICK LOW1]`) are the FILENAMES, not chunk data. So the one free
per-file signal is the filename — and it's a MIXED bag:
descriptive '1 KICK LOW1.wav' → instrument label (gold)
mnemonic '1 JUPI.wav' → pack codename (opaque; needs audio)
source/lyric '01_doing_it_right1.wav' → provenance (remix-DNA), not a family
A filename carries up to three orthogonal facts: a leading pad/slot index, an
instrument hint, a source hint. We parse all three (with provenance), map the
instrument hint to a fleet family via a conservative token lexicon, and leave
`family=None` when the name is opaque — that None is the signal to fall back to
`sample_classify` (audio). This is L1 of the layered resolver:
L1 filename (this module, free, high-precision where it fires)
L2 folder (models.classify_sample_family — instrument folder names)
L3 audio (sample_classify CLAP/PANNs/ensemble — opaque names + validator)
Used for: per-FILE ground truth to validate L3 (filenames beat folder names, which
lie — 808hc=conga), per-`:index` resolution, and source-hint for the remix story.
"""
import re
from pathlib import Path
import sample_ontology as ONT
# Root of the sample library whose folders are harvested.
DIRT = Path.home() / ".local/share/SuperCollider/downloaded-quarks/Dirt-Samples"
# File suffixes (lower-cased before comparison) that count as audio samples.
AUDIO_EXT = {".wav", ".aif", ".aiff", ".flac", ".ogg", ".mp3"}

# Instrument-token → fleet family. CONSERVATIVE (the project principle: don't guess).
# Tokens are matched against UPPERCASED, word-split filename tokens, so every entry
# here must be spelled in upper case to be reachable. Order in the resolver below is
# by family PRIORITY when a name carries several (drums win over texture:
# '2 VOX HEART' → vox, but 'KICK LAYER' → kick). Keep only unambiguous instrument
# words; register words (LOW/HIGH) are modifiers, not families. Note SUB is listed
# under bass, so it only decides the family when no higher-priority token is present.
TOKEN_FAMILY = {
    "kick": ["KICK", "KIK", "BD", "BASSDRUM", "BDRUM", "KCK"],
    "snare": ["SNARE", "SNR", "SN", "SD", "RIM", "RIMSHOT", "CLAP", "CLP", "SNAP",
              "HANDCLAP"],
    # 'HI-HAT' can never be produced by the word splitter (it splits on '-'); such a
    # name arrives as HI + HAT and resolves through 'HAT'. Kept for documentation.
    "hat": ["HH", "HAT", "HIHAT", "HI-HAT", "OH", "CH", "OPENHAT", "CLOSEDHAT",
            "CYMBAL", "CYM", "RIDE", "CRASH"],
    "perc": ["PERC", "CONGA", "BONGO", "TOM", "TABLA", "SHAKER", "SHKR", "COWBELL",
             "CLAVE", "WOODBLOCK", "TAMB", "TAMBOURINE", "TIMBALE", "DJEMBE", "CABASA"],
    "bass": ["BASS", "SUB", "808", "BASSLINE", "REESE", "WOBBLE"],
    "keys": ["PIANO", "RHODES", "EP", "EPIANO", "ORGAN", "CLAV", "CLAVINET",
             "KEYS", "KEY", "WURLI", "HARPSI"],
    "lead": ["LEAD", "ARP", "ARPEGGIO", "ACID", "SOLO"],
    "synth": ["SYNTH", "SYN", "SAW", "SQUARE", "STAB", "PLUCK", "BLEEP", "BLIP"],
    "pad": ["PAD", "STRINGS", "STRING", "CHORD", "DRONE", "ATMOS", "ATMOSPHERE",
            "BRASS", "FLUTE", "CHOIR"],
    # 'RVS' was previously spelled 'RvS', which could never equal an uppercased token.
    "fx": ["FX", "SFX", "NOISE", "RISER", "SWEEP", "IMPACT", "DOWNLIFTER", "UPLIFTER",
           "WHOOSH", "GLITCH", "ZAP", "LASER", "RVS", "REVERSE", "FOLEY"],
    "vox": ["VOX", "VOICE", "VOC", "VOCAL", "CHANT", "SING", "AAH", "OOH",
            "ACAPELLA", "SPEECH", "TALK"],
}

# resolver priority: percussive identity is the most reliable filename signal
PRIORITY = ["kick", "snare", "hat", "perc", "vox", "bass", "lead", "synth", "keys",
            "pad", "fx"]

# Flat token → family lookup. Keys are upper-cased here as well, so a lexicon entry
# typed in mixed case cannot silently become unreachable.
_LUT = {tok.upper(): fam for fam, toks in TOKEN_FAMILY.items() for tok in toks}

_PAD_RE = re.compile(r"^\s*(\d{1,3})[\s_\-.]+")  # leading slot index: '1 ', '01_', '3-'
_WORD_RE = re.compile(r"[A-Za-z0-9]+")  # alphanumeric runs; '_', '-', ' ' all split
def parse_name(stem):
    """Split a filename stem into its pad index, word tokens and fleet family.

    Returns a dict with keys:
      pad     – leading slot number as int, or None when the stem has no index prefix
      tokens  – upper-cased alphanumeric words found after the pad prefix
      family  – highest-priority family among the recognised tokens, else None
      matched – the token that decided `family` (None when nothing matched)
      opaque  – True exactly when `family` is None
    """
    prefix = _PAD_RE.match(stem)
    if prefix is None:
        pad, body = None, stem
    else:
        pad, body = int(prefix.group(1)), stem[prefix.end():]

    tokens = [word.upper() for word in _WORD_RE.findall(body)]

    # Remember one deciding token per family; a later token of the same family
    # overwrites an earlier one, so the LAST occurrence is reported.
    deciding = {}
    for token in tokens:
        if token in _LUT:
            deciding[_LUT[token]] = token

    # First family in PRIORITY order that was seen wins.
    family = next((candidate for candidate in PRIORITY if candidate in deciding), None)
    matched = None if family is None else deciding[family]

    return {
        "pad": pad,
        "tokens": tokens,
        "family": family,
        "matched": matched,
        "opaque": family is None,
    }
def folder_files(name):
    """Return the sorted audio paths directly inside DIRT/<name>.

    Returns None when the folder does not exist or holds no entry whose
    lower-cased suffix is in AUDIO_EXT.
    """
    folder = DIRT / name
    if not folder.is_dir():
        return None
    audio = [entry for entry in folder.iterdir() if entry.suffix.lower() in AUDIO_EXT]
    audio.sort()
    return audio if audio else None
def folder_meta(name):
    """Harvest filename metadata for every audio file in one sample folder.

    Returns None when `folder_files(name)` yields nothing. Otherwise a dict:
      n, n_named   – total files / files whose name resolved to a family
      by_family    – family → count, most frequent first
      dominant     – most frequent family (None if no file was named)
      homogeneous  – dominant family covers ≥60% of the NAMED files
      kit_like     – at least two distinct families appear by name
      coverage     – n_named / n, rounded to 3 decimals
      per_index    – one row per file: index, name, plus the parse_name() fields
      source       – always "filename"
    """
    paths = folder_files(name)
    if not paths:
        return None

    rows = []
    counts = {}
    for position, path in enumerate(paths):
        parsed = parse_name(path.stem)
        row = {"index": position, "name": path.stem}
        row.update(parsed)
        rows.append(row)
        family = parsed["family"]
        if family:
            counts[family] = counts.get(family, 0) + 1

    # Stable sort by descending count: ties keep first-seen order, so the head of
    # the ranking is the same family max(counts, key=counts.get) would pick.
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    dominant = ranked[0][0] if ranked else None
    n_named = sum(counts.values())
    homogeneous = n_named > 0 and counts[dominant] / n_named >= 0.6

    return {
        "n": len(paths),
        "n_named": n_named,
        "by_family": dict(ranked),
        "dominant": dominant,
        "homogeneous": homogeneous,
        "kit_like": len(counts) >= 2,
        "coverage": round(n_named / len(paths), 3),
        "per_index": rows,
        "source": "filename",
    }
# ── quick coverage report over the corpus vocabulary ──────────────────────────
def _corpus_sounds():
    """Return the sorted, de-duplicated sound names used across the catalog.

    Reads `catalog_view.json` from the directory containing this module and
    collects every entry of each track's optional `score_sounds` list.
    Raises whatever `open`/`json.load` raise if the file is missing or malformed,
    and KeyError if the document has no top-level "tracks" key.
    """
    import json

    catalog_path = Path(__file__).resolve().parent / "catalog_view.json"
    # Context manager so the handle is closed promptly; explicit UTF-8 so the
    # read does not depend on the platform's locale encoding.
    with open(catalog_path, encoding="utf-8") as fh:
        cv = json.load(fh)
    return sorted({s for t in cv["tracks"] for s in t.get("score_sounds", [])})
def cmd_report():
    """Print filename-metadata coverage over every corpus folder that has audio.

    Reports how many folders have at least one named file, how many files carry
    an instrument token, and how many folders are kit-like. Prints a short notice
    instead of dividing by zero when no corpus folder has audio.
    """
    # folder_meta() returns None exactly when the folder is missing or has no
    # audio, so a single pass both filters and harvests (no second listing).
    metas = [meta for meta in map(folder_meta, _corpus_sounds()) if meta]
    print(f"⛵ filename-metadata coverage over {len(metas)} corpus folders with audio\n")
    if not metas:
        # Nothing to aggregate; the percentages below would divide by zero.
        print(" (no corpus folders with audio found — nothing to report)")
        return

    files_total = sum(meta["n"] for meta in metas)
    files_named = sum(meta["n_named"] for meta in metas)
    any_named = sum(1 for meta in metas if meta["n_named"])
    kits = sum(1 for meta in metas if meta["kit_like"])

    print(f" folders with ANY named file : {any_named}/{len(metas)} "
          f"({any_named/len(metas)*100:.0f}%)")
    # files_total > 0 here: every retained folder holds at least one audio file.
    print(f" files with an instrument tok: {files_named}/{files_total} "
          f"({files_named/files_total*100:.0f}%)")
    print(f" kit-like folders (≥2 fams) : {kits}")
def cmd_one(name):
    """Print the filename harvest for a single folder (first 24 files only)."""
    m = folder_meta(name)
    if m is None:
        print(f"no folder/files for {name!r}")
        return

    print(f"⛵ {name} — {m['n']} files, coverage={m['coverage']} "
          f"dominant={m['dominant']} kit_like={m['kit_like']}\n")

    shown = m["per_index"][:24]
    for r in shown:
        # Unnamed files are displayed as '·opaque'; the deciding token, when
        # there is one, is shown in angle brackets.
        fam = r["family"] if r["family"] else "·opaque"
        tag = "<" + r["matched"] + ">" if r["matched"] else ""
        pad_text = str(r["pad"])
        print(f" :{r['index']:<3} pad={pad_text:<4} {fam:<8} "
              f"{tag:<14} {r['name']}")
if __name__ == "__main__":
    import sys

    # `one <folder>` inspects a single folder; anything else prints the report.
    cli_args = sys.argv[1:]
    wants_single = len(cli_args) >= 2 and cli_args[0] == "one"
    if not wants_single:
        cmd_report()
    else:
        cmd_one(cli_args[1])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment