fix(classifier): refuse to guess kits + source-named folders; fix sept1 morpher

PLN-flagged chain of labeling errors, traced to the SSOT classifier:
- 'jazz' was matched to BREAK, but jazz is a multisample KIT (jazz:0=kick, :1=snare/hat…). A folder name is not a reliable family signal: it may be one family, a heterogeneous kit, or a demucs grab named after a SOURCE song (wap, take5, the_revolution, xplosive, rample*). classify_sample_family now fires ONLY on names that lexically encode an instrument; everything else is None (= needs per-sample analysis). No 'kit registry' (that's name-guessing too).
- removed over-reaching genre/source tokens: jazz, dnb, jungle, loop from break; drum from perc. This also FIXES jungle_pads (→pad, was break) and jungle_vocals (→vox). amen kept (amencutup genuinely is the Amen break).
- tempo: strip Tidal '--' line comments before parsing cps (ton_numero's commented-out morpher no longer counts); a track with a live 'cps (range …)' is now flagged morph even when it also declares a fixed setcps. morphing=1 (Septembre 1er, 60→180), was 0.
- report: added stage_tempo_by_year, sources/roadmap, recurrence gig_slugs, and classified/unclassified coverage (21% of palette uses need analysis — reported honestly).
- tests: classifier refuses kit/source names; jungle_pads→pad guard. 60 tests green.

fix(classifier): refuse to guess kits + source-named folders; fix sept1 morpher
PLN-flagged chain of labeling errors, traced to the SSOT classifier:
- 'jazz' was matched to BREAK, but jazz is a multisample KIT (jazz:0=kick, :1=snare/hat…). A folder name is not a reliable family signal: it may be one family, a heterogeneous kit, or a demucs grab named after a SOURCE song (wap, take5, the_revolution, xplosive, rample*). classify_sample_family now fires ONLY on names that lexically encode an instrument; everything else is None (= needs per-sample analysis). No 'kit registry' (that's name-guessing too).
- removed over-reaching genre/source tokens: jazz, dnb, jungle, loop from break; drum from perc. This also FIXES jungle_pads (→pad, was break) and jungle_vocals (→vox). amen kept (amencutup genuinely is the Amen break).
- tempo: strip Tidal '--' line comments before parsing cps (ton_numero's commented-out morpher no longer counts); a track with a live 'cps (range …)' is now flagged morph even when it also declares a fixed setcps. morphing=1 (Septembre 1er, 60→180), was 0.
- report: added stage_tempo_by_year, sources/roadmap, recurrence gig_slugs, and classified/unclassified coverage (21% of palette uses need analysis — reported honestly).
- tests: classifier refuses kit/source names; jungle_pads→pad guard. 60 tests green.
e9bba22c · PLN (Algolia) · 3f2863d4 · e9bba22c · e9bba22c · e9bba22c
Commit e9bba22c authored Jun 06, 2026 by PLN (Algolia)
5 changed files
--- a/armada/tide-table/eda_report.json
+++ b/armada/tide-table/eda_report.json
--- a/armada/tide-table/models.py
+++ b/armada/tide-table/models.py
@@ -521,9 +521,9 @@ class ColorFamily(BaseModel):
 SAMPLE_FAMILIES = [
    ColorFamily(key="kick",  label="Kick",   glyph="●", hue=25,  match=["kick", "kik", "bd", "808bd", "909", "bassdrum"]),
    ColorFamily(key="snare", label="Snare",  glyph="◆", hue=50,  match=["snare", "sn", "sd", "clap", "claps", "cp", "rim", "rs"]),
-    ColorFamily(key="perc",  label="Perc",   glyph="▴", hue=80,  match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell", "drum"]),
+    ColorFamily(key="perc",  label="Perc",   glyph="▴", hue=80,  match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell"]),
    ColorFamily(key="hat",   label="Hat",    glyph="✦", hue=110, match=["hat", "hh", "ho", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]),
-    ColorFamily(key="break", label="Break",  glyph="≈", hue=150, match=["break", "amen", "loop", "jungle", "dnb", "jazz", "breaks165", "fbreak"]),
+    ColorFamily(key="break", label="Break",  glyph="≈", hue=150, match=["break", "amen", "breaks165", "fbreak"]),
    ColorFamily(key="pad",   label="Pad",    glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos", "atm", "airport", "trance"]),
    ColorFamily(key="keys",  label="Keys",   glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "cbow", "cpluck", "clav", "marimba", "mandolin", "guitar", "organ", "forgan"]),
    ColorFamily(key="lead",  label="Lead",   glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square", "brass", "sax", "horn", "trump", "tromb"]),
@@ -542,12 +542,14 @@ _STRONG_CONTAINS = [("kick", "kick"), ("snare", "snare"), ("clap", "snare"),
                    ("piano", "keys"), ("rhodes", "keys"), ("vocal", "vox"),
                    ("voice", "vox"), ("dialog", "vox")]
 def classify_sample_family(name: str):
    """Canonical sample-name → family key (or None). The SSOT classifier every surface
-    should use (DRY). Token-aware: a name like `vec1_snare` or `nujazz_bass125` resolves
+    should use (DRY). It classifies ONLY by instrument tokens the NAME actually encodes
-    by its embedded instrument token, not just its prefix. Sample-world model, not a
+    (`vec1_snare`→snare, `moogBass`→bass) — a deliberately conservative sample-world
-    measured-register claim. Returns the family `key` or None when genuinely unknown."""
+    claim, NOT a measured-register one. A bare folder/kit/source name (`jazz`, `gretsch`,
+    `house`) carries no reliable instrument signal — a folder may be one family, a
+    heterogeneous kit indexed by `:n`, or a demucs grab from some track — so it returns
+    None. Resolving those is an analysis job (per-sample/index), never a name guess."""
    s = name.lower()
    toks = [t for t in re.split(r"[_\-0-9]+", s) if t]
    for f in SAMPLE_FAMILIES:

--- a/armada/tide-table/tests/test_tokens.py
+++ b/armada/tide-table/tests/test_tokens.py
@@ -88,7 +88,6 @@ def test_sample_classifier_examples():
    assert c("808bd") == "kick"
    assert c("hats") == "hat"
    assert c("meth_bass") == "bass"
-    assert c("jazz") == "break"
    assert c("piano") == "keys"
    # token-aware / drum-machine / embedded (the coverage-improving cases)
    assert c("h2ogmhh") == "hat"          # DM suffix, gated by 'gm'
@@ -107,6 +106,20 @@ def test_classifier_does_not_overreach():
        assert M.classify_sample_family(unknown) is None
+def test_classifier_refuses_kit_and_source_names():
+    """A folder name that doesn't encode an instrument must NOT be force-classified.
+    Kits (jazz:0=kick, :1=snare…), drum machines, and folders named after a SOURCE
+    song carry no reliable family signal — only per-sample analysis can label them."""
+    c = M.classify_sample_family
+    for kit in ("jazz", "gretsch", "drum", "drumtraks", "techno"):   # multisample kits
+        assert c(kit) is None, kit
+    for src in ("wap", "take5", "the_revolution", "xplosive", "rampleS13"):  # sampled
+        assert c(src) is None, src
+    # but a name that DOES encode the instrument still resolves
+    assert c("jungle_breaks") == "break"
+    assert c("jungle_pads") == "pad"      # was wrongly 'break' via the genre token
 def test_style_normalization():
    assert M.norm_style("nu-jazz") == M.norm_style("nujazz") == "nujazz"
    assert M.norm_style("breakbeat") == M.norm_style("breaks") == "breaks"

--- a/armada/tide-table/tide_eda.py
+++ b/armada/tide-table/tide_eda.py
@@ -57,6 +57,9 @@ def parse_tempo(src):
    """Return {bpm, lo, hi, morph} from a .tidal source, or None. bpm = primary tempo."""
    if not src:
        return None
+    # drop Tidal line comments (`-- …`) so commented-out experiments don't count
+    # (e.g. ton_numero has a `-- # cps (range …)` morpher that was never live)
+    src = re.sub(r"--[^\n]*", "", src)
    # all setcps / cps / cpsbus / "# cps" statements
    cands = []
    for m in re.finditer(r"(?:setcps|cpsbus\s*\d+|#\s*cps|\bcps)\s*\(?(.+)", src):
@@ -75,9 +78,17 @@ def parse_tempo(src):
            cands.append({"bpm": float(bare.group(1)) * 240, "lo": None, "hi": None, "morph": False})
    if not cands:
        return None
-    # prefer a fixed setcps; fall back to first morphing range
+    # primary BPM = the fixed setcps (the declared base tempo) if there is one,
+    # else the first range's midpoint. But a track that ALSO has a `cps (range …)`
+    # morpher is a morpher (sept1 declares 120 then sweeps 60→180): keep the base
+    # bpm for sorting, flag the morph, and carry its span.
    fixed = [c for c in cands if not c["morph"]]
-    return (fixed or cands)[0]
+    morphs = [c for c in cands if c["morph"]]
+    out = dict((fixed or cands)[0])
+    if morphs:
+        out["morph"] = True
+        out["lo"], out["hi"] = morphs[0]["lo"], morphs[0]["hi"]
+    return out
 # ── sample-family classifier (DRY: the canonical SSOT classifier in models.py) ─
@@ -303,7 +314,8 @@ def build():
    # recurrence
    recurrence = sorted(({"name": t["name"], "gigs": len(t["gigs"]),
-                          "track": t["track"]} for t in T),
+                          "track": t["track"], "gig_slugs": sorted(t.get("gigs", []))}
+                         for t in T),
                        key=lambda x: -x["gigs"])
    # collab dimension (live/collab/<who>/…)
@@ -336,6 +348,11 @@ def build():
        "styles": dict(sty.most_common()),
        "palette_top": dict(snd.most_common(25)),
        "families": dict(fam.most_common()),
+        # honest coverage: a sample-NAME only encodes an instrument sometimes; kits,
+        # drum machines and source-named chops (wap, take5…) need analysis, not a guess.
+        "classified_uses": sum(fam.values()),
+        "unclassified_uses": sum(unclassified.values()),
+        "unclassified_names": len(unclassified),
        "unclassified_top": dict(unclassified.most_common(15)),
        "idioms_top": [{"norm": p["norm"], "n_tracks": p["n_tracks"]} for p in shared[:15]],
        "idioms_counts": {"shared": len(shared), "repeated": len(repeated),
@@ -343,6 +360,18 @@ def build():
        "recurrence_top": recurrence[:20],
        "collab": dict(collab.most_common()),
        "vocabulary_growth": dict(sorted(vocabulary_growth().items())),
+        # which inputs fed this build (shown in the viz footer; append as we add more)
+        "sources": [
+            {"key": "score", "label": ".tidal scores",
+             "detail": "tempo, sample palette, phrases"},
+            {"key": "site", "label": "site gig metadata",
+             "detail": "style tags, bpm, setlists"},
+            {"key": "git", "label": "git history", "detail": "track creation dates"},
+            {"key": "dirt", "label": "Dirt-Samples links",
+             "detail": "sample-folder import dates"},
+        ],
+        # near-future sources (kept honest: shown as "coming", not faked)
+        "roadmap": ["platform play counts", "git edit counts", "per-set durations"],
    }
    return report

--- a/armada/tide-table/tokens.json
+++ b/armada/tide-table/tokens.json
@@ -228,8 +228,7 @@
    "clave",
    "shaker",
    "tabla",
-    "cowbell",
+    "cowbell"
-    "drum"
   ]
  },
  {
@@ -278,10 +277,6 @@
   "match": [
    "break",
    "amen",
-    "loop",
-    "jungle",
-    "dnb",
-    "jazz",
    "breaks165",
    "fbreak"
   ]