Commit 21d003dd by PLN (Algolia)

fix(eda holes): canonical sample classifier + style normalization

- models.py: add classify_sample_family() SSOT (token-aware, DM-suffix gated,
  strong-contains fallback) + extend SAMPLE_FAMILIES cues → unclassified 89→34
  uses (~93% classified); honest unknowns (armora, 90s_matrix) stay None.
- models.py: STYLE_ALIASES + norm_style() merges nu-jazz/nujazz, breaks/breakbeat,
  chip/chiptune, hip-hop/hip so the style chart is honest.
- tide_eda + tests use the canonical classifier (DRY); +3 mechanical tests
  (token cases, no-overreach incl. the shil-'oh' FP, style norm). 53 pass.
- regen tokens (cues feed match[] arrays).
parent f7132e2d
......@@ -1148,27 +1148,22 @@
"2026": 3
},
"styles": {
"dnb": 43,
"nujazz": 34,
"dnb": 45,
"breaks": 41,
"nujazz": 40,
"techno": 31,
"breaks": 31,
"lounge": 24,
"ambient": 11,
"breakbeat": 10,
"lofi": 7,
"nu-jazz": 6,
"acid": 6,
"drill": 5,
"acid": 4,
"punk": 3,
"chiptune": 2,
"collab": 2,
"hip-hop": 2,
"punk-dnb": 2,
"acid-techno": 2,
"hiphop": 2,
"hybrid": 2,
"chiptune": 1,
"funk": 1,
"downtempo": 1,
"chip": 1,
"dance": 1
},
"palette_top": {
......@@ -1199,34 +1194,35 @@
"sn": 5
},
"families": {
"break": 106,
"synth": 57,
"snare": 54,
"bass": 51,
"kick": 23,
"fx": 19,
"keys": 18,
"hat": 13,
"vox": 5,
"pad": 1,
"lead": 1
"break": 112,
"synth": 85,
"snare": 72,
"bass": 64,
"hat": 39,
"keys": 37,
"kick": 27,
"vox": 20,
"fx": 18,
"perc": 9,
"pad": 5,
"lead": 4
},
"unclassified_top": {
"h2ogmhh": 20,
"90s_synatm": 9,
"giorgio_syn": 7,
"vec1_snare": 7,
"cbow": 6,
"fbass": 6,
"drum": 5,
"vec1_acid": 5,
"fguitar": 5,
"h2ogmsn": 4,
"vec2_synth_acid": 3,
"armora": 3,
"rhadamanthe_vocal": 3,
"h2ogmcp": 3,
"vec1_claps": 3
"dr": 3,
"90s_matrix": 3,
"praise": 2,
"fsynth": 2,
"superfork": 2,
"90s_megafx": 2,
"shiloh": 2,
"supersiren": 2,
"nujazz_beats120": 2,
"ifdrums": 1,
"rhadamanthe_melo": 1,
"ccc": 1,
"ghost": 1,
"jane_wang": 1
},
"idioms_top": [
{
......
......@@ -15,6 +15,7 @@ read. Enums serialize as their string value.
"""
from __future__ import annotations
import re
from datetime import date
from enum import Enum
from typing import Optional
......@@ -519,19 +520,67 @@ class ColorFamily(BaseModel):
# conceptual identity, NOT a measured register claim (that's ROLE_FAMILIES, by audio).
SAMPLE_FAMILIES = [
ColorFamily(key="kick", label="Kick", glyph="●", hue=25, match=["kick", "kik", "bd", "808bd", "909", "bassdrum"]),
ColorFamily(key="snare", label="Snare", glyph="◆", hue=50, match=["snare", "sn", "sd", "clap", "cp", "rim", "rs"]),
ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell"]),
ColorFamily(key="hat", label="Hat", glyph="✦", hue=110, match=["hat", "hh", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]),
ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "loop", "jungle", "dnb", "jazz"]),
ColorFamily(key="pad", label="Pad", glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos"]),
ColorFamily(key="keys", label="Keys", glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "nujazz"]),
ColorFamily(key="lead", label="Lead", glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square"]),
ColorFamily(key="synth", label="Synth", glyph="◈", hue=260, match=["synth", "commodore", "chip", "fm", "moog", "reese"]),
ColorFamily(key="bass", label="Bass", glyph="▂", hue=290, match=["bass", "sub", "808", "acid", "wobble", "meth_bass", "ramplem", "bassline"]),
ColorFamily(key="fx", label="FX", glyph="✺", hue=320, match=["fx", "riser", "sweep", "noise", "impact", "downlifter", "uplifter", "glitch", "vinyl", "foley"]),
ColorFamily(key="vox", label="Vox", glyph="◍", hue=355, match=["vox", "vocal", "voc", "acap", "speech", "voice"]),
ColorFamily(key="snare", label="Snare", glyph="◆", hue=50, match=["snare", "sn", "sd", "clap", "claps", "cp", "rim", "rs"]),
ColorFamily(key="perc", label="Perc", glyph="▴", hue=80, match=["perc", "conga", "bongo", "tom", "clave", "shaker", "tabla", "cowbell", "drum"]),
ColorFamily(key="hat", label="Hat", glyph="✦", hue=110, match=["hat", "hh", "ho", "oh", "ch", "hihat", "cymbal", "cym", "ride", "crash"]),
ColorFamily(key="break", label="Break", glyph="≈", hue=150, match=["break", "amen", "loop", "jungle", "dnb", "jazz", "breaks165", "fbreak"]),
ColorFamily(key="pad", label="Pad", glyph="◌", hue=180, match=["pad", "drone", "choir", "string", "ambient", "atmos", "atm", "airport", "trance"]),
ColorFamily(key="keys", label="Keys", glyph="♬", hue=205, match=["key", "keys", "piano", "rhodes", "epiano", "organ", "fpiano", "qstab", "cbow", "cpluck", "clav", "marimba", "mandolin", "guitar", "organ", "forgan"]),
ColorFamily(key="lead", label="Lead", glyph="♪", hue=230, match=["lead", "arp", "pluck", "stab", "blip", "saw", "square", "brass", "sax", "horn", "trump", "tromb"]),
ColorFamily(key="synth", label="Synth", glyph="◈", hue=260, match=["synth", "syn", "commodore", "chip", "fm", "moog", "reese", "giorgio", "electro", "cs80", "vec"]),
ColorFamily(key="bass", label="Bass", glyph="▂", hue=290, match=["bass", "sub", "808", "acid", "wobble", "meth_bass", "ramplem", "bassline", "fbass"]),
ColorFamily(key="fx", label="FX", glyph="✺", hue=320, match=["fx", "riser", "risers", "sweep", "noise", "impact", "downlifter", "uplifter", "glitch", "vinyl", "foley"]),
ColorFamily(key="vox", label="Vox", glyph="◍", hue=355, match=["vox", "vocal", "voc", "acap", "speech", "voice", "movie", "dialog", "dialogs"]),
]
# Drum-machine name SUFFIXES (h2ogmhh, …) and strong embedded tokens, for names that
# don't lead with their family cue. Order matters: these run after the cue loop.
#
# _DM_SUFFIX maps a two-letter name ending to a family key. classify_sample_family
# only consults it for names that pass its drum-machine gate, and returns the family
# of the first ending (in dict order) that the name ends with.
_DM_SUFFIX = {"hh": "hat", "oh": "hat", "ch": "hat", "sn": "snare", "sd": "snare",
"cp": "snare", "bd": "kick", "cy": "hat"}
# _STRONG_CONTAINS is the last-resort table: (substring, family key) pairs tried in
# list order; the first substring found anywhere in the lowercased name wins.
_STRONG_CONTAINS = [("kick", "kick"), ("snare", "snare"), ("clap", "snare"),
("bass", "bass"), ("hat", "hat"), ("guitar", "keys"),
("piano", "keys"), ("rhodes", "keys"), ("vocal", "vox"),
("voice", "vox"), ("dialog", "vox")]
def classify_sample_family(name: str) -> Optional[str]:
    """Canonical sample-name → family key (or None).

    The SSOT classifier every surface should use (DRY). Token-aware: a name like
    `vec1_snare` or `nujazz_bass125` resolves by its embedded instrument token, not
    just its prefix. Sample-world model, not a measured-register claim.

    Resolution order, first hit wins:
      1. the whole name IS a declared cue → the family that declares it;
      2. cue loop, families in SAMPLE_FAMILIES order: the name starts with a cue,
         a token equals a cue, or a token starts with a cue of 3+ characters;
      3. drum-machine suffix (_DM_SUFFIX), only on DM-looking names;
      4. strong embedded substring (_STRONG_CONTAINS).

    Args:
        name: raw sample name; matching is case-insensitive.

    Returns:
        The family `key`, or None when genuinely unknown.
    """
    s = name.lower()

    # 1. An exact cue must land in the family that declares it. Without this pass the
    #    prefix rule below lets a short cue of an EARLIER family claim it first:
    #    'cp' (snare) swallowed 'cpluck' (keys); 'ch'/'ho' (hat) swallowed 'choir'
    #    (pad), 'chip' (synth) and 'horn' (lead).
    for f in SAMPLE_FAMILIES:
        if s in f.match:
            return f.key

    # 2. Cue loop. Tokens are the pieces between underscores, hyphens and digit runs,
    #    so `nujazz_bass125` yields ["nujazz", "bass"].
    toks = [t for t in re.split(r"[_\-0-9]+", s) if t]
    for f in SAMPLE_FAMILIES:
        for m in f.match:
            if s.startswith(m) or m in toks or \
                    (len(m) >= 3 and any(t.startswith(m) for t in toks)):
                return f.key
    # NOTE(review): the whole-name prefix test above is not length-gated, so a
    # two-letter cue still claims longer non-cue names (e.g. 'choir_pad' → hat).
    # Left as-is because undelimited drum names may rely on it — confirm against
    # the sample library before tightening.

    # 3. drum-machine suffixes only on DM-style names (h2ogmhh, tr909sn) — not "shiloh"
    if re.search(r"gm|dm|tr\d|rytm|\dmt", s) or re.search(r"\d(hh|oh|ch|sn|sd|cp|bd|cy)$", s):
        for suf, fam in _DM_SUFFIX.items():
            if s.endswith(suf):
                return fam

    # 4. last resort: a strong instrument word embedded anywhere (gtkick, …)
    for sub, fam in _STRONG_CONTAINS:
        if sub in s:
            return fam
    return None
# ── style label normalization (corner C metadata is hand-typed & fragmented) ──
# Keys are looked up by norm_style AFTER it strips and lowercases the label, so every
# key here must already be lowercase with no surrounding whitespace. Canonical
# spellings are listed as identity entries; labels absent from the table pass through.
# Besides spelling variants, two sub-style labels are folded into their parent
# ("acid-techno" → "acid", "punk-dnb" → "dnb").
STYLE_ALIASES = {
"nu-jazz": "nujazz", "nujazz": "nujazz", "breakbeat": "breaks", "breaks": "breaks",
"chip": "chiptune", "chiptune": "chiptune", "hip-hop": "hiphop", "hip": "hiphop",
"acid-techno": "acid", "punk-dnb": "dnb", "lofi": "lofi", "lo-fi": "lofi",
}
def norm_style(s: Optional[str]) -> Optional[str]:
    """Canonicalize a hand-typed style label.

    Falsy input (None or "") is returned unchanged, so a caller can pass a possibly
    missing metadata field straight through. Any other label is stripped and
    lowercased, then mapped through STYLE_ALIASES; a label with no alias is returned
    in that stripped, lowercased form.

    Args:
        s: raw style label, or None/"" when the track has no style.

    Returns:
        The canonical label, or the falsy input itself.
    """
    if not s:
        return s
    key = s.strip().lower()
    return STYLE_ALIASES.get(key, key)
# The whole vocabulary, for gen_tokens (and anyone who needs the fleet paint).
ONTOLOGY = {
"role": ROLE_FAMILIES,
......
......@@ -82,17 +82,32 @@ def test_tokens_validate_against_model():
def test_sample_classifier_examples():
"""The lexical sample-family rules should land obvious cases (coherent model)."""
fam = {f.key: f for f in M.SAMPLE_FAMILIES}
def classify(name):
s = name.lower()
for f in M.SAMPLE_FAMILIES:
if any(s == m or s.startswith(m) for m in f.match):
return f.key
return None
assert classify("kick") == "kick"
assert classify("808bd") == "kick"
assert classify("hats") == "hat"
assert classify("meth_bass") == "bass"
assert classify("jazz") == "break"
assert classify("superpiano") is None or classify("piano") == "keys"
"""The canonical SSOT classifier lands obvious cases AND the token-aware ones."""
c = M.classify_sample_family
assert c("kick") == "kick"
assert c("808bd") == "kick"
assert c("hats") == "hat"
assert c("meth_bass") == "bass"
assert c("jazz") == "break"
assert c("piano") == "keys"
# token-aware / drum-machine / embedded (the coverage-improving cases)
assert c("h2ogmhh") == "hat" # DM suffix, gated by 'gm'
assert c("vec1_snare") == "snare" # embedded instrument token
assert c("nujazz_bass125") == "bass" # token wins over 'nujazz'
assert c("nujazz_keys120") == "keys"
assert c("fbass") == "bass"
assert c("weird_dialogs") == "vox"
assert c("ho") == "hat"
def test_classifier_does_not_overreach():
    """Guard against over-matching: 'shiloh' must not be read as a hat via its
    trailing 'oh', and the known-unknown names must stay unclassified."""
    classify = M.classify_sample_family
    assert classify("shiloh") != "hat"
    assert classify("armora") is None
    assert classify("90s_matrix") is None
def test_style_normalization():
    """norm_style merges spelling variants, folds case/whitespace, and passes
    unknown or missing labels through."""
    norm = M.norm_style
    # spelling variants collapse onto one canonical label
    assert norm("nu-jazz") == norm("nujazz") == "nujazz"
    assert norm("breakbeat") == norm("breaks") == "breaks"
    assert norm("chip") == norm("chiptune") == "chiptune"
    assert norm("hip-hop") == norm("hip") == "hiphop"
    # hand-typed labels: surrounding whitespace and case must not fragment the chart
    assert norm("  Nu-Jazz ") == "nujazz"
    assert norm("Techno") == "techno"
    assert norm("techno") == "techno"  # unknown passes through
    # tracks without a style reach norm_style as None or ""; both pass through
    assert norm(None) is None
    assert norm("") == ""
......@@ -80,13 +80,8 @@ def parse_tempo(src):
return (fixed or cands)[0]
# ── sample-family classifier (DRY: same cues the fleet color language uses) ────
def sample_family(name):
s = name.lower()
for f in M.SAMPLE_FAMILIES:
if any(s == m or s.startswith(m) for m in f.match):
return f.key
return None
# ── sample-family classifier (DRY: the canonical SSOT classifier in models.py) ─
sample_family = M.classify_sample_family
def gig_year_index():
......@@ -110,7 +105,7 @@ def metadata_bpm():
for t in (d.get("tracks") or []):
rows.append({"gig": slug, "date": (d.get("date") or "")[:10],
"file": t.get("file"), "name": t.get("name"),
"bpm": t.get("bpm"), "style": t.get("style")})
"bpm": t.get("bpm"), "style": M.norm_style(t.get("style"))})
return rows
......
......@@ -199,6 +199,7 @@
"sn",
"sd",
"clap",
"claps",
"cp",
"rim",
"rs"
......@@ -227,7 +228,8 @@
"clave",
"shaker",
"tabla",
"cowbell"
"cowbell",
"drum"
]
},
{
......@@ -248,6 +250,7 @@
"match": [
"hat",
"hh",
"ho",
"oh",
"ch",
"hihat",
......@@ -278,7 +281,9 @@
"loop",
"jungle",
"dnb",
"jazz"
"jazz",
"breaks165",
"fbreak"
]
},
{
......@@ -302,7 +307,10 @@
"choir",
"string",
"ambient",
"atmos"
"atmos",
"atm",
"airport",
"trance"
]
},
{
......@@ -329,7 +337,14 @@
"organ",
"fpiano",
"qstab",
"nujazz"
"cbow",
"cpluck",
"clav",
"marimba",
"mandolin",
"guitar",
"organ",
"forgan"
]
},
{
......@@ -354,7 +369,12 @@
"stab",
"blip",
"saw",
"square"
"square",
"brass",
"sax",
"horn",
"trump",
"tromb"
]
},
{
......@@ -374,11 +394,16 @@
],
"match": [
"synth",
"syn",
"commodore",
"chip",
"fm",
"moog",
"reese"
"reese",
"giorgio",
"electro",
"cs80",
"vec"
]
},
{
......@@ -404,7 +429,8 @@
"wobble",
"meth_bass",
"ramplem",
"bassline"
"bassline",
"fbass"
]
},
{
......@@ -425,6 +451,7 @@
"match": [
"fx",
"riser",
"risers",
"sweep",
"noise",
"impact",
......@@ -456,7 +483,10 @@
"voc",
"acap",
"speech",
"voice"
"voice",
"movie",
"dialog",
"dialogs"
]
}
]
......
......@@ -199,6 +199,7 @@
"sn",
"sd",
"clap",
"claps",
"cp",
"rim",
"rs"
......@@ -227,7 +228,8 @@
"clave",
"shaker",
"tabla",
"cowbell"
"cowbell",
"drum"
]
},
{
......@@ -248,6 +250,7 @@
"match": [
"hat",
"hh",
"ho",
"oh",
"ch",
"hihat",
......@@ -278,7 +281,9 @@
"loop",
"jungle",
"dnb",
"jazz"
"jazz",
"breaks165",
"fbreak"
]
},
{
......@@ -302,7 +307,10 @@
"choir",
"string",
"ambient",
"atmos"
"atmos",
"atm",
"airport",
"trance"
]
},
{
......@@ -329,7 +337,14 @@
"organ",
"fpiano",
"qstab",
"nujazz"
"cbow",
"cpluck",
"clav",
"marimba",
"mandolin",
"guitar",
"organ",
"forgan"
]
},
{
......@@ -354,7 +369,12 @@
"stab",
"blip",
"saw",
"square"
"square",
"brass",
"sax",
"horn",
"trump",
"tromb"
]
},
{
......@@ -374,11 +394,16 @@
],
"match": [
"synth",
"syn",
"commodore",
"chip",
"fm",
"moog",
"reese"
"reese",
"giorgio",
"electro",
"cs80",
"vec"
]
},
{
......@@ -404,7 +429,8 @@
"wobble",
"meth_bass",
"ramplem",
"bassline"
"bassline",
"fbass"
]
},
{
......@@ -425,6 +451,7 @@
"match": [
"fx",
"riser",
"risers",
"sweep",
"noise",
"impact",
......@@ -456,7 +483,10 @@
"voc",
"acap",
"speech",
"voice"
"voice",
"movie",
"dialog",
"dialogs"
]
}
]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment