Commit e9bba22c by PLN (Algolia)

fix(classifier): refuse to guess kits + source-named folders; fix sept1 morpher

PLN-flagged chain of labeling errors, traced to the SSOT classifier:
- 'jazz' was matched to BREAK, but jazz is a multisample KIT (jazz:0=kick,
  :1=snare/hat…). A folder name is not a reliable family signal: it may be one
  family, a heterogeneous kit, or a demucs grab named after a SOURCE song
  (wap, take5, the_revolution, xplosive, rample*). classify_sample_family now
  fires ONLY on names that lexically encode an instrument; everything else is
  None (= needs per-sample analysis). No 'kit registry' (that's name-guessing too).
- removed over-reaching genre/source tokens: jazz, dnb, jungle, loop from break;
  drum from perc. This also FIXES jungle_pads (→pad, was break) and
  jungle_vocals (→vox). amen kept (amencutup genuinely is the Amen break).
- tempo: strip Tidal '--' line comments before parsing cps (ton_numero's
  commented-out morpher no longer counts); a track with a live 'cps (range …)'
  is now flagged morph even when it also declares a fixed setcps. morphing=1
  (Septembre 1er, 60→180), was 0.
- report: + stage_tempo_by_year, sources/roadmap, recurrence gig_slugs,
  classified/unclassified coverage (21% of palette uses need analysis, honest).
- tests: classifier refuses kit/source names; jungle_pads→pad guard. 60 green.
parent 3f2863d4
...@@ -521,9 +521,9 @@ class ColorFamily(BaseModel): ...@@ -521,9 +521,9 @@ class ColorFamily(BaseModel):
SAMPLE_FAMILIES = [ SAMPLE_FAMILIES = [
ColorFamily(key="kick", label="Kick", glyph="●", hue=25, match=["kick", "kik", "bd", "808bd", "909", "bassdrum"]), ColorFamily(key="kick", label="Kick", glyph="●", hue=25, match=["kick", "kik", "bd", "808bd", "909", "bassdrum"]),
ColorFamily(key="snare", label="Snare", glyph="◆", hue=50, match=["snare", "sn", "sd", "clap", "claps", "cp", "rim", "rs"]), ColorFamily(key="snare", label="Snare", glyph="◆", hue=50, match=["snare", "sn", "sd", "clap", "claps", "cp", "rim", "rs"]),
ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell", "drum"]), ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell"]),
ColorFamily(key="hat", label="Hat", glyph="✦", hue=110, match=["hat", "hh", "ho", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]), ColorFamily(key="hat", label="Hat", glyph="✦", hue=110, match=["hat", "hh", "ho", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]),
ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "loop", "jungle", "dnb", "jazz", "breaks165", "fbreak"]), ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "breaks165", "fbreak"]),
ColorFamily(key="pad", label="Pad", glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos", "atm", "airport", "trance"]), ColorFamily(key="pad", label="Pad", glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos", "atm", "airport", "trance"]),
ColorFamily(key="keys", label="Keys", glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "cbow", "cpluck", "clav", "marimba", "mandolin", "guitar", "organ", "forgan"]), ColorFamily(key="keys", label="Keys", glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "cbow", "cpluck", "clav", "marimba", "mandolin", "guitar", "organ", "forgan"]),
ColorFamily(key="lead", label="Lead", glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square", "brass", "sax", "horn", "trump", "tromb"]), ColorFamily(key="lead", label="Lead", glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square", "brass", "sax", "horn", "trump", "tromb"]),
...@@ -542,12 +542,14 @@ _STRONG_CONTAINS = [("kick", "kick"), ("snare", "snare"), ("clap", "snare"), ...@@ -542,12 +542,14 @@ _STRONG_CONTAINS = [("kick", "kick"), ("snare", "snare"), ("clap", "snare"),
("piano", "keys"), ("rhodes", "keys"), ("vocal", "vox"), ("piano", "keys"), ("rhodes", "keys"), ("vocal", "vox"),
("voice", "vox"), ("dialog", "vox")] ("voice", "vox"), ("dialog", "vox")]
def classify_sample_family(name: str): def classify_sample_family(name: str):
"""Canonical sample-name → family key (or None). The SSOT classifier every surface """Canonical sample-name → family key (or None). The SSOT classifier every surface
should use (DRY). Token-aware: a name like `vec1_snare` or `nujazz_bass125` resolves should use (DRY). It classifies ONLY by instrument tokens the NAME actually encodes
by its embedded instrument token, not just its prefix. Sample-world model, not a (`vec1_snare`→snare, `moogBass`→bass) — a deliberately conservative sample-world
measured-register claim. Returns the family `key` or None when genuinely unknown.""" claim, NOT a measured-register one. A bare folder/kit/source name (`jazz`, `gretsch`,
`house`) carries no reliable instrument signal — a folder may be one family, a
heterogeneous kit indexed by `:n`, or a demucs grab from some track — so it returns
None. Resolving those is an analysis job (per-sample/index), never a name guess."""
s = name.lower() s = name.lower()
toks = [t for t in re.split(r"[_\-0-9]+", s) if t] toks = [t for t in re.split(r"[_\-0-9]+", s) if t]
for f in SAMPLE_FAMILIES: for f in SAMPLE_FAMILIES:
......
...@@ -88,7 +88,6 @@ def test_sample_classifier_examples(): ...@@ -88,7 +88,6 @@ def test_sample_classifier_examples():
assert c("808bd") == "kick" assert c("808bd") == "kick"
assert c("hats") == "hat" assert c("hats") == "hat"
assert c("meth_bass") == "bass" assert c("meth_bass") == "bass"
assert c("jazz") == "break"
assert c("piano") == "keys" assert c("piano") == "keys"
# token-aware / drum-machine / embedded (the coverage-improving cases) # token-aware / drum-machine / embedded (the coverage-improving cases)
assert c("h2ogmhh") == "hat" # DM suffix, gated by 'gm' assert c("h2ogmhh") == "hat" # DM suffix, gated by 'gm'
...@@ -107,6 +106,20 @@ def test_classifier_does_not_overreach(): ...@@ -107,6 +106,20 @@ def test_classifier_does_not_overreach():
assert M.classify_sample_family(unknown) is None assert M.classify_sample_family(unknown) is None
def test_classifier_refuses_kit_and_source_names():
    """Names that encode no instrument must come back as None, never a guessed family.

    Multisample kits (jazz:0=kick, :1=snare…), drum machines, and folders named
    after a SOURCE song give no reliable family signal; labelling them is left
    to per-sample analysis.
    """
    classify = M.classify_sample_family
    multisample_kits = ("jazz", "gretsch", "drum", "drumtraks", "techno")
    source_named = ("wap", "take5", "the_revolution", "xplosive", "rampleS13")
    # kits first, then sampled-source folders: none of them may be classified
    for candidate in multisample_kits + source_named:
        assert classify(candidate) is None, candidate
    # a name that DOES embed an instrument token still resolves;
    # jungle_pads used to come out as 'break' through the genre token
    instrument_named = (("jungle_breaks", "break"), ("jungle_pads", "pad"))
    for candidate, family in instrument_named:
        assert classify(candidate) == family
def test_style_normalization(): def test_style_normalization():
assert M.norm_style("nu-jazz") == M.norm_style("nujazz") == "nujazz" assert M.norm_style("nu-jazz") == M.norm_style("nujazz") == "nujazz"
assert M.norm_style("breakbeat") == M.norm_style("breaks") == "breaks" assert M.norm_style("breakbeat") == M.norm_style("breaks") == "breaks"
......
...@@ -57,6 +57,9 @@ def parse_tempo(src): ...@@ -57,6 +57,9 @@ def parse_tempo(src):
"""Return {bpm, lo, hi, morph} from a .tidal source, or None. bpm = primary tempo.""" """Return {bpm, lo, hi, morph} from a .tidal source, or None. bpm = primary tempo."""
if not src: if not src:
return None return None
# drop Tidal line comments (`-- …`) so commented-out experiments don't count
# (e.g. ton_numero has a `-- # cps (range …)` morpher that was never live)
src = re.sub(r"--[^\n]*", "", src)
# all setcps / cps / cpsbus / "# cps" statements # all setcps / cps / cpsbus / "# cps" statements
cands = [] cands = []
for m in re.finditer(r"(?:setcps|cpsbus\s*\d+|#\s*cps|\bcps)\s*\(?(.+)", src): for m in re.finditer(r"(?:setcps|cpsbus\s*\d+|#\s*cps|\bcps)\s*\(?(.+)", src):
...@@ -75,9 +78,17 @@ def parse_tempo(src): ...@@ -75,9 +78,17 @@ def parse_tempo(src):
cands.append({"bpm": float(bare.group(1)) * 240, "lo": None, "hi": None, "morph": False}) cands.append({"bpm": float(bare.group(1)) * 240, "lo": None, "hi": None, "morph": False})
if not cands: if not cands:
return None return None
# prefer a fixed setcps; fall back to first morphing range # primary BPM = the fixed setcps (the declared base tempo) if there is one,
# else the first range's midpoint. But a track that ALSO has a `cps (range …)`
# morpher is a morpher (sept1 declares 120 then sweeps 60→180): keep the base
# bpm for sorting, flag the morph, and carry its span.
fixed = [c for c in cands if not c["morph"]] fixed = [c for c in cands if not c["morph"]]
return (fixed or cands)[0] morphs = [c for c in cands if c["morph"]]
out = dict((fixed or cands)[0])
if morphs:
out["morph"] = True
out["lo"], out["hi"] = morphs[0]["lo"], morphs[0]["hi"]
return out
# ── sample-family classifier (DRY: the canonical SSOT classifier in models.py) ─ # ── sample-family classifier (DRY: the canonical SSOT classifier in models.py) ─
...@@ -303,7 +314,8 @@ def build(): ...@@ -303,7 +314,8 @@ def build():
# recurrence # recurrence
recurrence = sorted(({"name": t["name"], "gigs": len(t["gigs"]), recurrence = sorted(({"name": t["name"], "gigs": len(t["gigs"]),
"track": t["track"]} for t in T), "track": t["track"], "gig_slugs": sorted(t.get("gigs", []))}
for t in T),
key=lambda x: -x["gigs"]) key=lambda x: -x["gigs"])
# collab dimension (live/collab/<who>/…) # collab dimension (live/collab/<who>/…)
...@@ -336,6 +348,11 @@ def build(): ...@@ -336,6 +348,11 @@ def build():
"styles": dict(sty.most_common()), "styles": dict(sty.most_common()),
"palette_top": dict(snd.most_common(25)), "palette_top": dict(snd.most_common(25)),
"families": dict(fam.most_common()), "families": dict(fam.most_common()),
# honest coverage: a sample-NAME only encodes an instrument sometimes; kits,
# drum machines and source-named chops (wap, take5…) need analysis, not a guess.
"classified_uses": sum(fam.values()),
"unclassified_uses": sum(unclassified.values()),
"unclassified_names": len(unclassified),
"unclassified_top": dict(unclassified.most_common(15)), "unclassified_top": dict(unclassified.most_common(15)),
"idioms_top": [{"norm": p["norm"], "n_tracks": p["n_tracks"]} for p in shared[:15]], "idioms_top": [{"norm": p["norm"], "n_tracks": p["n_tracks"]} for p in shared[:15]],
"idioms_counts": {"shared": len(shared), "repeated": len(repeated), "idioms_counts": {"shared": len(shared), "repeated": len(repeated),
...@@ -343,6 +360,18 @@ def build(): ...@@ -343,6 +360,18 @@ def build():
"recurrence_top": recurrence[:20], "recurrence_top": recurrence[:20],
"collab": dict(collab.most_common()), "collab": dict(collab.most_common()),
"vocabulary_growth": dict(sorted(vocabulary_growth().items())), "vocabulary_growth": dict(sorted(vocabulary_growth().items())),
# which inputs fed this build (shown in the viz footer; append as we add more)
"sources": [
{"key": "score", "label": ".tidal scores",
"detail": "tempo, sample palette, phrases"},
{"key": "site", "label": "site gig metadata",
"detail": "style tags, bpm, setlists"},
{"key": "git", "label": "git history", "detail": "track creation dates"},
{"key": "dirt", "label": "Dirt-Samples links",
"detail": "sample-folder import dates"},
],
# near-future sources (kept honest: shown as "coming", not faked)
"roadmap": ["platform play counts", "git edit counts", "per-set durations"],
} }
return report return report
......
...@@ -228,8 +228,7 @@ ...@@ -228,8 +228,7 @@
"clave", "clave",
"shaker", "shaker",
"tabla", "tabla",
"cowbell", "cowbell"
"drum"
] ]
}, },
{ {
...@@ -278,10 +277,6 @@ ...@@ -278,10 +277,6 @@
"match": [ "match": [
"break", "break",
"amen", "amen",
"loop",
"jungle",
"dnb",
"jazz",
"breaks165", "breaks165",
"fbreak" "fbreak"
] ]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment