Commit e9bba22c by PLN (Algolia)

fix(classifier): refuse to guess kits + source-named folders; fix sept1 morpher

PLN-flagged chain of labeling errors, traced to the SSOT classifier:
- 'jazz' was matched to BREAK, but jazz is a multisample KIT (jazz:0=kick,
  :1=snare/hat…). A folder name is not a reliable family signal: it may be one
  family, a heterogeneous kit, or a demucs grab named after a SOURCE song
  (wap, take5, the_revolution, xplosive, rample*). classify_sample_family now
  fires ONLY on names that lexically encode an instrument; everything else is
  None (= needs per-sample analysis). No 'kit registry' (that's name-guessing too).
- removed over-reaching genre/source tokens: jazz, dnb, jungle, loop from break;
  drum from perc. This also FIXES jungle_pads (→pad, was break) and
  jungle_vocals (→vox). amen kept (amencutup genuinely is the Amen break).
- tempo: strip Tidal '--' line comments before parsing cps (ton_numero's
  commented-out morpher no longer counts); a track with a live 'cps (range …)'
  is now flagged morph even when it also declares a fixed setcps. morphing_tracks
  is now 1 (Septembre 1er, 60→180); it was 0.
- report: adds stage_tempo_by_year, sources/roadmap, per-track recurrence gig_slugs,
  and classified/unclassified coverage (an honest 21% of palette uses still need analysis).
- tests: classifier refuses kit/source names; jungle_pads→pad guard. 60 green.
parent 3f2863d4
......@@ -13,8 +13,8 @@
"min": 80.0,
"max": 170.0,
"median": 120.0,
"mean": 120.2,
"morphing_tracks": 0,
"mean": 120.5,
"morphing_tracks": 1,
"histogram": {
"80": 8,
"90": 7,
......@@ -22,9 +22,9 @@
"110": 11,
"120": 23,
"130": 4,
"140": 7,
"140": 6,
"150": 2,
"160": 7,
"160": 8,
"170": 1
},
"ac_delta": [
......@@ -53,6 +53,12 @@
"delta": 60.0
},
{
"track": "You My Sunshine",
"score_bpm": 166.0,
"meta_bpm": 144,
"delta": 22.0
},
{
"track": "Contre visite",
"score_bpm": 90.0,
"meta_bpm": 80,
......@@ -77,12 +83,6 @@
"delta": 0.0
},
{
"track": "Septembre 1er",
"score_bpm": 120.0,
"meta_bpm": 120,
"delta": 0.0
},
{
"track": "Blue Gold",
"score_bpm": 140.0,
"meta_bpm": 140,
......@@ -323,12 +323,6 @@
"delta": 0.0
},
{
"track": "You My Sunshine",
"score_bpm": 144.0,
"meta_bpm": 144,
"delta": 0.0
},
{
"track": "Nouveau Soleil",
"score_bpm": 110.0,
"meta_bpm": 110,
......@@ -747,7 +741,7 @@
"track": "live/collab/baba/sept1.tidal",
"created": "2024-09-06",
"bpm": 120.0,
"morph": false
"morph": true
},
{
"name": "Drifting Soul",
......@@ -900,7 +894,7 @@
"name": "You My Sunshine",
"track": "live/midi/nova/dnb/liquid/you_my_sunshine.tidal",
"created": "2026-03-18",
"bpm": 144.0,
"bpm": 166.0,
"morph": false
},
{
......@@ -1080,7 +1074,7 @@
"n": 45,
"bpm_median": 117,
"bpm_min": 80.0,
"bpm_max": 165.0,
"bpm_max": 166.0,
"tracks": [
{
"name": "Sessions Break",
......@@ -1239,10 +1233,6 @@
"bpm": 138.0
},
{
"name": "You My Sunshine",
"bpm": 144.0
},
{
"name": "Nuit Agitée",
"bpm": 160.0
},
......@@ -1261,6 +1251,10 @@
{
"name": "Break the Loop",
"bpm": 165.0
},
{
"name": "You My Sunshine",
"bpm": 166.0
}
],
"distinctive_samples": [
......@@ -1453,23 +1447,28 @@
"sn": 5
},
"families": {
"break": 112,
"synth": 85,
"snare": 72,
"bass": 64,
"break": 67,
"bass": 65,
"hat": 39,
"keys": 37,
"kick": 27,
"vox": 20,
"vox": 24,
"fx": 18,
"perc": 9,
"pad": 5,
"pad": 7,
"lead": 4
},
"classified_uses": 445,
"unclassified_uses": 118,
"unclassified_names": 64,
"unclassified_top": {
"jazz": 37,
"drum": 5,
"armora": 3,
"dr": 3,
"90s_matrix": 3,
"drums_atari": 2,
"praise": 2,
"fsynth": 2,
"superfork": 2,
......@@ -1478,10 +1477,7 @@
"supersiren": 2,
"nujazz_beats120": 2,
"ifdrums": 1,
"rhadamanthe_melo": 1,
"ccc": 1,
"ghost": 1,
"jane_wang": 1
"rhadamanthe_melo": 1
},
"idioms_top": [
{
......@@ -1554,102 +1550,274 @@
{
"name": "Sunny Side Up",
"gigs": 11,
"track": "live/midi/nova/lounge/sunny_side_up.tidal"
"track": "live/midi/nova/lounge/sunny_side_up.tidal",
"gig_slugs": [
"2024/algolia-last-all-hands",
"2024/la-french-stack",
"2025/air-elementeuf",
"2025/algolia-rko",
"2025/bunker",
"2025/cosmicfest",
"2025/fairyteuf",
"2025/la-french-stack",
"2025/raise",
"2025/val-thorens",
"2026/montreuil-algorave"
]
},
{
"name": "Café Tiède",
"gigs": 10,
"track": "live/midi/nova/nujazz/cafe_tiede.tidal"
"track": "live/midi/nova/nujazz/cafe_tiede.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/la-french-stack",
"2024/toplap-20-years",
"2025/air-elementeuf",
"2025/algolia-rko",
"2025/la-french-stack",
"2025/raise"
]
},
{
"name": "Contre visite",
"gigs": 9,
"track": "live/midi/nova/ambient/contre_visite.tidal"
"track": "live/midi/nova/ambient/contre_visite.tidal",
"gig_slugs": [
"2022/bazurto",
"2023/cmny-2",
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/divin-live",
"2024/la-french-stack",
"2024/velociteuf",
"2025/algolia-rko",
"2025/la-french-stack"
]
},
{
"name": "Force Motrice",
"gigs": 9,
"track": "live/midi/nova/dnb/force_motrice.tidal"
"track": "live/midi/nova/dnb/force_motrice.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/la-french-stack",
"2024/toplap-20-years",
"2025/bunker",
"2025/la-french-stack",
"2025/raise"
]
},
{
"name": "Nuit Agitée",
"gigs": 8,
"track": "live/midi/nova/breaks/nuit_agitee.tidal"
"track": "live/midi/nova/breaks/nuit_agitee.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/toplap-20-years",
"2025/algolia-rko",
"2025/raise",
"2025/val-thorens"
]
},
{
"name": "Salut Nu",
"gigs": 8,
"track": "live/midi/nova/nujazz/salut_nu.tidal"
"track": "live/midi/nova/nujazz/salut_nu.tidal",
"gig_slugs": [
"2023/toplap-solstice",
"2024/algolia-fdlm",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/la-french-stack",
"2025/air-elementeuf",
"2025/la-french-stack",
"2025/raise"
]
},
{
"name": "Permanence",
"gigs": 7,
"track": "live/collab/raph/permanence.tidal"
"track": "live/collab/raph/permanence.tidal",
"gig_slugs": [
"2023/toplap-solstice",
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/la-french-stack",
"2024/toplap-20-years",
"2025/algolia-rko",
"2025/la-french-stack"
]
},
{
"name": "Invoque l'ete",
"gigs": 7,
"track": "live/midi/nova/lounge/invoque_ete.tidal"
"track": "live/midi/nova/lounge/invoque_ete.tidal",
"gig_slugs": [
"2022/bazurto",
"2023/cmny-2",
"2023/devcon23",
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2024/divin-live",
"2025/algolia-rko"
]
},
{
"name": "Café Glacé",
"gigs": 7,
"track": "live/midi/nova/nujazz/cafe_glace.tidal"
"track": "live/midi/nova/nujazz/cafe_glace.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/toplap-20-years",
"2025/air-elementeuf",
"2025/raise",
"2025/val-thorens"
]
},
{
"name": "Septembre 1er",
"gigs": 6,
"track": "live/collab/baba/sept1.tidal"
"track": "live/collab/baba/sept1.tidal",
"gig_slugs": [
"2025/air-elementeuf",
"2025/bunker",
"2025/cosmicfest",
"2025/raise",
"2025/val-thorens",
"2026/montreuil-algorave"
]
},
{
"name": "L'or Bleu",
"gigs": 6,
"track": "live/collab/mousquetaires/blue_gold.tidal"
"track": "live/collab/mousquetaires/blue_gold.tidal",
"gig_slugs": [
"2024/38c3-toilet",
"2024/algolia-last-all-hands",
"2024/cookie-collective-compilation",
"2025/raise",
"2025/val-thorens",
"2026/montreuil-algorave"
]
},
{
"name": "Alerte Verte",
"gigs": 6,
"track": "live/midi/nova/dnb/alerte_verte.tidal"
"track": "live/midi/nova/dnb/alerte_verte.tidal",
"gig_slugs": [
"2022/bazurto",
"2023/cmny-2",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2024/divin-live",
"2025/val-thorens"
]
},
{
"name": "Café Bouillant",
"gigs": 6,
"track": "live/midi/nova/nujazz/cafe_bouillant.tidal"
"track": "live/midi/nova/nujazz/cafe_bouillant.tidal",
"gig_slugs": [
"2023/toplap-solstice",
"2024/algolia-fdlm",
"2024/algolia-last-all-hands",
"2025/air-elementeuf",
"2025/algolia-rko",
"2025/val-thorens"
]
},
{
"name": "Acidule",
"gigs": 5,
"track": "live/collab/raph/acidule.tidal"
"track": "live/collab/raph/acidule.tidal",
"gig_slugs": [
"2024/38c3-toilet",
"2024/ccc-live",
"2024/cookie-collective-compilation",
"2025/bunker",
"2025/val-thorens"
]
},
{
"name": "Jeudi Drill",
"gigs": 5,
"track": "live/collab/raph/jeudrill.tidal"
"track": "live/collab/raph/jeudrill.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2025/cosmicfest",
"2025/fairyteuf",
"2025/raise",
"2026/montreuil-algorave"
]
},
{
"name": "Something about Drums",
"gigs": 5,
"track": "live/midi/nova/dnb/something_about_drums.tidal"
"track": "live/midi/nova/dnb/something_about_drums.tidal",
"gig_slugs": [
"2024/algolia-last-all-hands",
"2024/la-french-stack",
"2025/algolia-rko",
"2025/la-french-stack",
"2025/raise"
]
},
{
"name": "Venons Ensemble",
"gigs": 5,
"track": "live/midi/nova/dnb/venons_ensemble.tidal"
"track": "live/midi/nova/dnb/venons_ensemble.tidal",
"gig_slugs": [
"2024/algolia-fdlm",
"2024/divin-live",
"2024/velociteuf",
"2025/cosmicfest",
"2025/la-french-stack"
]
},
{
"name": "PunkAChien",
"gigs": 4,
"track": "live/collab/raph/punkachien.tidal"
"track": "live/collab/raph/punkachien.tidal",
"gig_slugs": [
"2024/38c3-toilet",
"2025/raise",
"2025/val-thorens",
"2026/montreuil-algorave"
]
},
{
"name": "La fin de l'insouciance",
"gigs": 4,
"track": "live/midi/nova/beatober/oct_16_haunted_house_insouciance.tidal"
"track": "live/midi/nova/beatober/oct_16_haunted_house_insouciance.tidal",
"gig_slugs": [
"2023/toplap-solstice",
"2025/cosmicfest",
"2025/raise",
"2025/val-thorens"
]
},
{
"name": "Bain électrique",
"gigs": 4,
"track": "live/midi/nova/breaks/bain_electrique.tidal"
"track": "live/midi/nova/breaks/bain_electrique.tidal",
"gig_slugs": [
"2024/la-french-stack",
"2025/cosmicfest",
"2025/la-french-stack",
"2025/raise"
]
}
],
"collab": {
......@@ -1679,5 +1847,32 @@
"2026-01": 1,
"2026-04": 3,
"2026-05": 1
},
"sources": [
{
"key": "score",
"label": ".tidal scores",
"detail": "tempo, sample palette, phrases"
},
{
"key": "site",
"label": "site gig metadata",
"detail": "style tags, bpm, setlists"
},
{
"key": "git",
"label": "git history",
"detail": "track creation dates"
},
{
"key": "dirt",
"label": "Dirt-Samples links",
"detail": "sample-folder import dates"
}
],
"roadmap": [
"platform play counts",
"git edit counts",
"per-set durations"
]
}
\ No newline at end of file
......@@ -521,9 +521,9 @@ class ColorFamily(BaseModel):
SAMPLE_FAMILIES = [
ColorFamily(key="kick", label="Kick", glyph="●", hue=25, match=["kick", "kik", "bd", "808bd", "909", "bassdrum"]),
ColorFamily(key="snare", label="Snare", glyph="◆", hue=50, match=["snare", "sn", "sd", "clap", "claps", "cp", "rim", "rs"]),
ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell", "drum"]),
ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell"]),
ColorFamily(key="hat", label="Hat", glyph="✦", hue=110, match=["hat", "hh", "ho", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]),
ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "loop", "jungle", "dnb", "jazz", "breaks165", "fbreak"]),
ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "breaks165", "fbreak"]),
ColorFamily(key="pad", label="Pad", glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos", "atm", "airport", "trance"]),
ColorFamily(key="keys", label="Keys", glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "cbow", "cpluck", "clav", "marimba", "mandolin", "guitar", "organ", "forgan"]),
ColorFamily(key="lead", label="Lead", glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square", "brass", "sax", "horn", "trump", "tromb"]),
......@@ -542,12 +542,14 @@ _STRONG_CONTAINS = [("kick", "kick"), ("snare", "snare"), ("clap", "snare"),
("piano", "keys"), ("rhodes", "keys"), ("vocal", "vox"),
("voice", "vox"), ("dialog", "vox")]
def classify_sample_family(name: str):
"""Canonical sample-name → family key (or None). The SSOT classifier every surface
should use (DRY). Token-aware: a name like `vec1_snare` or `nujazz_bass125` resolves
by its embedded instrument token, not just its prefix. Sample-world model, not a
measured-register claim. Returns the family `key` or None when genuinely unknown."""
should use (DRY). It classifies ONLY by instrument tokens the NAME actually encodes
(`vec1_snare`→snare, `moogBass`→bass) — a deliberately conservative sample-world
claim, NOT a measured-register one. A bare folder/kit/source name (`jazz`, `gretsch`,
`house`) carries no reliable instrument signal — a folder may be one family, a
heterogeneous kit indexed by `:n`, or a demucs grab from some track — so it returns
None. Resolving those is an analysis job (per-sample/index), never a name guess."""
s = name.lower()
toks = [t for t in re.split(r"[_\-0-9]+", s) if t]
for f in SAMPLE_FAMILIES:
......
......@@ -88,7 +88,6 @@ def test_sample_classifier_examples():
assert c("808bd") == "kick"
assert c("hats") == "hat"
assert c("meth_bass") == "bass"
assert c("jazz") == "break"
assert c("piano") == "keys"
# token-aware / drum-machine / embedded (the coverage-improving cases)
assert c("h2ogmhh") == "hat" # DM suffix, gated by 'gm'
......@@ -107,6 +106,20 @@ def test_classifier_does_not_overreach():
assert M.classify_sample_family(unknown) is None
def test_classifier_refuses_kit_and_source_names():
    """Names that encode no instrument must come back as None, never a guessed family.

    Multisample kits (jazz:0=kick, :1=snare…), drum machines, and folders named
    after a SOURCE song give no reliable family signal; only per-sample analysis
    can label them. Names that DO embed an instrument token must still resolve."""
    classify = M.classify_sample_family
    kit_names = ("jazz", "gretsch", "drum", "drumtraks", "techno")  # multisample kits
    source_names = ("wap", "take5", "the_revolution", "xplosive", "rampleS13")  # sampled
    # kits first, then source-named folders; the failing name is the assert message
    for refused in kit_names + source_names:
        assert classify(refused) is None, refused
    # an embedded instrument token still decides the family
    assert classify("jungle_breaks") == "break"
    assert classify("jungle_pads") == "pad"  # was wrongly 'break' via the genre token
def test_style_normalization():
assert M.norm_style("nu-jazz") == M.norm_style("nujazz") == "nujazz"
assert M.norm_style("breakbeat") == M.norm_style("breaks") == "breaks"
......
......@@ -57,6 +57,9 @@ def parse_tempo(src):
"""Return {bpm, lo, hi, morph} from a .tidal source, or None. bpm = primary tempo."""
if not src:
return None
# drop Tidal line comments (`-- …`) so commented-out experiments don't count
# (e.g. ton_numero has a `-- # cps (range …)` morpher that was never live)
src = re.sub(r"--[^\n]*", "", src)
# all setcps / cps / cpsbus / "# cps" statements
cands = []
for m in re.finditer(r"(?:setcps|cpsbus\s*\d+|#\s*cps|\bcps)\s*\(?(.+)", src):
......@@ -75,9 +78,17 @@ def parse_tempo(src):
cands.append({"bpm": float(bare.group(1)) * 240, "lo": None, "hi": None, "morph": False})
if not cands:
return None
# prefer a fixed setcps; fall back to first morphing range
# primary BPM = the fixed setcps (the declared base tempo) if there is one,
# else the first range's midpoint. But a track that ALSO has a `cps (range …)`
# morpher is a morpher (sept1 declares 120 then sweeps 60→180): keep the base
# bpm for sorting, flag the morph, and carry its span.
fixed = [c for c in cands if not c["morph"]]
return (fixed or cands)[0]
morphs = [c for c in cands if c["morph"]]
out = dict((fixed or cands)[0])
if morphs:
out["morph"] = True
out["lo"], out["hi"] = morphs[0]["lo"], morphs[0]["hi"]
return out
# ── sample-family classifier (DRY: the canonical SSOT classifier in models.py) ─
......@@ -303,7 +314,8 @@ def build():
# recurrence
recurrence = sorted(({"name": t["name"], "gigs": len(t["gigs"]),
"track": t["track"]} for t in T),
"track": t["track"], "gig_slugs": sorted(t.get("gigs", []))}
for t in T),
key=lambda x: -x["gigs"])
# collab dimension (live/collab/<who>/…)
......@@ -336,6 +348,11 @@ def build():
"styles": dict(sty.most_common()),
"palette_top": dict(snd.most_common(25)),
"families": dict(fam.most_common()),
# honest coverage: a sample-NAME only encodes an instrument sometimes; kits,
# drum machines and source-named chops (wap, take5…) need analysis, not a guess.
"classified_uses": sum(fam.values()),
"unclassified_uses": sum(unclassified.values()),
"unclassified_names": len(unclassified),
"unclassified_top": dict(unclassified.most_common(15)),
"idioms_top": [{"norm": p["norm"], "n_tracks": p["n_tracks"]} for p in shared[:15]],
"idioms_counts": {"shared": len(shared), "repeated": len(repeated),
......@@ -343,6 +360,18 @@ def build():
"recurrence_top": recurrence[:20],
"collab": dict(collab.most_common()),
"vocabulary_growth": dict(sorted(vocabulary_growth().items())),
# which inputs fed this build (shown in the viz footer; append as we add more)
"sources": [
{"key": "score", "label": ".tidal scores",
"detail": "tempo, sample palette, phrases"},
{"key": "site", "label": "site gig metadata",
"detail": "style tags, bpm, setlists"},
{"key": "git", "label": "git history", "detail": "track creation dates"},
{"key": "dirt", "label": "Dirt-Samples links",
"detail": "sample-folder import dates"},
],
# near-future sources (kept honest: shown as "coming", not faked)
"roadmap": ["platform play counts", "git edit counts", "per-set durations"],
}
return report
......
......@@ -228,8 +228,7 @@
"clave",
"shaker",
"tabla",
"cowbell",
"drum"
"cowbell"
]
},
{
......@@ -278,10 +277,6 @@
"match": [
"break",
"amen",
"loop",
"jungle",
"dnb",
"jazz",
"breaks165",
"fbreak"
]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment