Commit 9f15bee8 by PLN (Algolia)

fix(classifier): drop bare startswith — 2-char cues must not prefix-match

The unguarded s.startswith(m) let short cues overreach by prefix-matching the
whole name: cp→cpluck→snare, 808→808hc→bass. It was also redundant, because the
≥3-char token-prefix clause already covers single-token plurals like pads→pad.
It is now removed, so short cues fire only as an exact name or an exact token.
cpluck→keys (its string identity, not snare); 808hc/808mc→None and defer to
audio (CLAP hears conga). Regression tests added. Full suite 61 green.
parent 521127ff
......@@ -554,7 +554,11 @@ def classify_sample_family(name: str):
toks = [t for t in re.split(r"[_\-0-9]+", s) if t]
for f in SAMPLE_FAMILIES:
for m in f.match:
if s == m or s.startswith(m) or m in toks or \
# exact name, exact token, or a ≥3-char cue prefixing a token (plurals:
# pads→pad, claps→clap). NO bare s.startswith(m): a 2-char cue like "cp"
# must not prefix-match "cpluck"→snare or "808"→"808hc"→bass. Short cues
# fire only as a whole token; ambiguous names fall through to None (→ CLAP).
if s == m or m in toks or \
(len(m) >= 3 and any(t.startswith(m) for t in toks)):
return f.key
# drum-machine suffixes only on DM-style names (h2ogmhh, tr909sn) — not "shiloh"
......
......@@ -106,6 +106,19 @@ def test_classifier_does_not_overreach():
assert M.classify_sample_family(unknown) is None
def test_classifier_short_cue_no_prefix_match():
    """Short cues match only as an exact name or whole token, never as a bare prefix.

    Regressions found during validation: `cp` (clap) used to swallow `cpluck`, and
    `808` (bass) used to swallow `808hc`/`808mc`, which are 808 drum-machine perc
    rather than bass. Those two names must now come back as None so that audio
    decides instead.
    """
    classify = M.classify_sample_family

    # The cue on its own is still an exact-name hit.
    assert classify("cp") == "snare"

    # "cpluck" is not "cp" + "luck": it keeps its own keys/string identity.
    assert classify("cpluck") != "snare"
    assert classify("cpluck") == "keys"

    # "808" followed by a "bd" token still resolves to a kick.
    assert classify("808bd") == "kick"

    # Previously mis-filed as 'bass'; hc/mc are ambiguous, so CLAP decides.
    for ambiguous_name in ("808hc", "808mc"):
        assert classify(ambiguous_name) is None
def test_classifier_refuses_kit_and_source_names():
"""A folder name that doesn't encode an instrument must NOT be force-classified.
Kits (jazz:0=kick, :1=snare…), drum machines, and folders named after a SOURCE
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment