Commit 9a428b6e by PLN (Algolia)

feat(backlog_setlists v2): explicit ends, label-strip, leet fallback + tests

- ANCHORS support (slug, end_line) to bound bleeders (mephisteuf/opal-2025/
  38c3-toilet) into clean blocks — no more swallowing neighbour setlists.
- clean_track_line: strip leading set-position labels (Intro:/Finale:/Sunset),
  keep 'Label: Name' / drop 'Name: gloss'; recovers the LIVE-Noctambule pattern.
- leet_fold fallback (B00K→book, T01l3ts→toilets, G00D→good) — resolves only
  after a straight miss so genuine digits (sept1) still match first.
- 9 gap gigs recovered (opal-2024 14/14, algolia-fdlm 26/26, 39c3 13/14…).
- 6 mechanical tests (leet, label, gloss, bpm/transition, ascii/comment skip,
  real-backlog coverage guard). 59 pass.
parent 21d003dd
...@@ -30,22 +30,26 @@ AS_OF = "2026-06-06" ...@@ -30,22 +30,26 @@ AS_OF = "2026-06-06"
# ── Authored gig anchors: (1-based line of the header in backlog.md) → site slug ── # ── Authored gig anchors: (1-based line of the header in backlog.md) → site slug ──
# `confirm:`-prefixed slugs are NOT yet trusted (ambiguous header / needs PLN's ear); # `confirm:`-prefixed slugs are NOT yet trusted (ambiguous header / needs PLN's ear);
# they are parsed and reported but withheld from the emitted artifact. # they are parsed and reported but withheld from the emitted artifact.
# value = slug OR (slug, end_line) to bound a block that would otherwise bleed
# into the next densely-packed setlist (the backlog has no blank-gap between them).
ANCHORS = { ANCHORS = {
1441: "2025/ensad", # "# Set @ENSAD 2025" 1441: "2025/ensad", # "# Set @ENSAD 2025"
1816: "2026/le-vortex", # "# VORTEX26" 1816: "2026/le-vortex", # "# VORTEX26"
1031: "2024/opal-festival-2024", # "## Opal <3🦉" / OPAL 2024 1031: "2024/opal-festival-2024", # "## Opal <3🦉" / OPAL 2024
1632: "2025/opal-festival-2025", # "# OPAL 2025" 1632: ("2025/opal-festival-2025", 1667), # "# OPAL 2025" (bleeds into Latin Heritage)
710: "2023/mephisteuf", # "## Live @MephisTeuf" 710: ("2023/mephisteuf", 735), # "## Live @MephisTeuf" (commented block = cut)
1194: "2024/38c3-house-of-tea", # "#### Day 2 - HOUSE OF Tea" (under 38C3) 1194: "2024/38c3-house-of-tea", # "#### Day 2 - HOUSE OF Tea" (under 38C3)
1242: "2024/38c3-toilet", # "#### Day 3 - Hardcore in the Toilet Club" 1242: ("2024/38c3-toilet", 1253), # cross-check only (already has tracks.json)
1786: "2025/39c3-house-of-tea", # "# CCC 39c3 ALGORAVE ... Pour Un Thé Dansant" 1786: "2025/39c3-house-of-tea", # "# CCC 39c3 ALGORAVE ... Pour Un Thé Dansant"
1169: "2024/toplap-solstice-2024", # "### TopLap Solstice Stream 2024" 1169: "2024/toplap-solstice-2024", # "### TopLap Solstice Stream 2024"
988: "confirm:2024/algolia-fdlm", # "## Algolia FDLM2024" — already has tracks.json 988: "confirm:2024/algolia-fdlm", # "## Algolia FDLM2024" — already has tracks.json
1550: "confirm:2025/algorave-lyon", # "GZ 2025 LYON ALGORAVE" then "# LABENNE LIVE" (?) 1550: "confirm:2025/algorave-lyon", # "GZ 2025 LYON ALGORAVE" then "# LABENNE LIVE" (?)
1831: "confirm:2026/ete-surprise", # "## SURPRISE IT'S A DANCEFLOOR" — slug guess 1831: "confirm:2026/ete-surprise", # "## SURPRISE IT'S A DANCEFLOOR" — slug guess
} }
# Gigs still unanchored (no clear backlog block found): 2022/algolia-fdlm-2022, # Still unanchored (gaps for #66, need PLN's ear / SC confirmation): 2022/algolia-fdlm-2022,
# 2024/38c3-chaos-music-club, 2025/39c3-toilet-rave, 2025/toplap-fromscratch-2025. # 2024/38c3-chaos-music-club, 2025/39c3-toilet-rave, 2025/toplap-fromscratch-2025. The
# backlog also holds many UNMAPPED setlists (Latin Heritage, NON-ANNIVERSAIRE, MICROLIVE,
# SovieTeuf, CCC Day-1, RAISE afterparty, Garibaldi…) whose slugs need confirming.
# Lines that are decoration / structure, never tracks. # Lines that are decoration / structure, never tracks.
_ASCII_ART = re.compile(r"[⠀-⣿▓▒░█▀▄■◆◢◣◤◥╔╗╚╝║═]") _ASCII_ART = re.compile(r"[⠀-⣿▓▒░█▀▄■◆◢◣◤◥╔╗╚╝║═]")
...@@ -65,6 +69,15 @@ def norm_name(s: str) -> str: ...@@ -65,6 +69,15 @@ def norm_name(s: str) -> str:
return re.sub(r"[^a-z0-9]+", "", deaccent(s).lower()) return re.sub(r"[^a-z0-9]+", "", deaccent(s).lower())
# Digit → letter substitutions for ParVagues' leetspeak spellings.
_LEET = str.maketrans("013457", "oieast")


def leet_fold(s: str) -> str:
    """Return *s* with leetspeak digits replaced by the letters they stand for.

    Maps 0→o, 1→i, 3→e, 4→a, 5→s, 7→t (b00k → book, t01l3ts → toilets); every
    other character passes through untouched. Meant as a FALLBACK lookup key only,
    so names holding genuine digits (sept1, 165) still get a straight match first.
    """
    folded = s.translate(_LEET)
    return folded
def build_name_index(): def build_name_index():
"""norm(name) → .tidal path, from the catalog's existing alias set (DRY).""" """norm(name) → .tidal path, from the catalog's existing alias set (DRY)."""
cv = json.load(open(CATALOG_VIEW)) cv = json.load(open(CATALOG_VIEW))
...@@ -102,7 +115,12 @@ def clean_track_line(raw: str): ...@@ -102,7 +115,12 @@ def clean_track_line(raw: str):
body = re.sub(r"\([^)]*\)", "", body) # (Chloe cover) etc body = re.sub(r"\([^)]*\)", "", body) # (Chloe cover) etc
body = re.sub(r"\bfeat\b.*$", "", body, flags=re.I) body = re.sub(r"\bfeat\b.*$", "", body, flags=re.I)
body = body.replace("*", "").replace("_", "").replace("`", "") body = body.replace("*", "").replace("_", "").replace("`", "")
body = re.sub(r":\s*.*$", "", body) # "Café tiède: note" → drop gloss after colon # strip a leading set-position label ("Intro: X", "Finale: X", "Sunset X")
body = _LABEL.sub("", body)
# remaining colon: "Label: Name" → keep Name; "Name: gloss" → keep Name
if ":" in body:
left, right = (p.strip() for p in body.split(":", 1))
body = right if right and (right[:1].isupper() and len(right.split()) <= 5) else left
# remove emoji / symbol chars, keep letters/digits/space/'-/& # remove emoji / symbol chars, keep letters/digits/space/'-/&
body = "".join(ch for ch in body body = "".join(ch for ch in body
if ch.isalnum() or ch in " '-/&." or unicodedata.category(ch).startswith("L")) if ch.isalnum() or ch in " '-/&." or unicodedata.category(ch).startswith("L"))
...@@ -118,14 +136,21 @@ def clean_track_line(raw: str): ...@@ -118,14 +136,21 @@ def clean_track_line(raw: str):
return body, bpm, transition, True return body, bpm, transition, True
def _slug_end(val):
    """Normalise an ANCHORS value to a ``(slug, end_line)`` pair.

    A tuple value is handed back unchanged; a bare slug is paired with ``None``,
    meaning no explicit end line was given.
    """
    if isinstance(val, tuple):
        return val
    return (val, None)
def block_span(lines, start_idx, end_line=None):
"""Collect a gig block. Headers in backlog.md are DECORATIVE (stylized banners, """Collect a gig block. Headers in backlog.md are DECORATIVE (stylized banners,
repeated `# PARVAGUES`, sub-labels) not hierarchical, so we don't scope by level. repeated `# PARVAGUES`, sub-labels) not hierarchical, so we don't scope by level.
Instead: from the anchor, walk to the next ANCHOR, but terminate early once the From the anchor, walk to the next ANCHOR (or an explicit end_line), terminating
setlist proper ends — a run of ≥3 blank lines after we've collected ≥2 items early once the setlist proper ends — a run of ≥3 blank lines after we've collected
(journal prose resumes). Decorative headers are kept as section context, not breaks.""" ≥2 items (journal prose resumes). Decorative headers are kept as section context."""
anchor_lines = sorted(k - 1 for k in ANCHORS) anchor_lines = sorted(k - 1 for k in ANCHORS)
hard_end = next((a for a in anchor_lines if a > start_idx), len(lines)) hard_end = next((a for a in anchor_lines if a > start_idx), len(lines))
if end_line is not None:
hard_end = min(hard_end, end_line - 1)
out, items, blanks = [], 0, 0 out, items, blanks = [], 0, 0
for raw in lines[start_idx + 1:hard_end]: for raw in lines[start_idx + 1:hard_end]:
if not raw.strip(): if not raw.strip():
...@@ -142,18 +167,23 @@ def block_span(lines, start_idx): ...@@ -142,18 +167,23 @@ def block_span(lines, start_idx):
# transition tails appended to a track name: "-> 5+7", " 0 - / 11", "- 9" # transition tails appended to a track name: "-> 5+7", " 0 - / 11", "- 9"
_TRANS_TAIL = re.compile(r"\s*(->|[-–])\s*[\d\s./+,]*$|\s+[\d][\d\s./+,>-]*$") _TRANS_TAIL = re.compile(r"\s*(->|[-–])\s*[\d\s./+,]*$|\s+[\d][\d\s./+,>-]*$")
# Leading set-position label to strip before the track name: "Intro: X",
# "Finale X", "Sunset — X", "Day 1 X". The label must end on a word boundary
# (so "Introspection" is left alone) and may be followed by ONE separator —
# colon, dot, hyphen, en dash or em dash — with optional whitespace around it.
_LABEL = re.compile(
    r"^\s*(intro|outro|finale?|bonus|rappel|sunset|sunrise|darkness|pause|"
    r"d[eé]but|fin|end|day\s*\d+)\b\s*[:.\-–—]?\s*", re.I)
def parse(lines, name_idx): def parse(lines, name_idx):
gigs = {} gigs = {}
for ln1, slug in sorted(ANCHORS.items()): for ln1, val in sorted(ANCHORS.items()):
slug, end_line = _slug_end(val)
confirm = slug.startswith("confirm:") confirm = slug.startswith("confirm:")
real_slug = slug.split(":", 1)[1] if confirm else slug real_slug = slug.split(":", 1)[1] if confirm else slug
i = ln1 - 1 i = ln1 - 1
header = lines[i].strip() header = lines[i].strip()
section = None section = None
tracks, unmatched = [], [] tracks, unmatched = [], []
for raw in block_span(lines, i): for raw in block_span(lines, i, end_line):
hm = re.match(r"\s*#{1,6}\s+(.+)$", raw) hm = re.match(r"\s*#{1,6}\s+(.+)$", raw)
if hm: # subsection label if hm: # subsection label
section = re.sub(r"[^\w\s'-]", "", deaccent(hm.group(1))).strip() or None section = re.sub(r"[^\w\s'-]", "", deaccent(hm.group(1))).strip() or None
...@@ -163,7 +193,8 @@ def parse(lines, name_idx): ...@@ -163,7 +193,8 @@ def parse(lines, name_idx):
continue continue
if _SECTION_HINT.match(name) and norm_name(name) not in name_idx: if _SECTION_HINT.match(name) and norm_name(name) not in name_idx:
continue continue
track = name_idx.get(norm_name(name)) nm = norm_name(name)
track = name_idx.get(nm) or name_idx.get(leet_fold(nm))
entry = {"raw": name, "track": track, "bpm": bpm, "transition": transition, "section": section} entry = {"raw": name, "track": track, "bpm": bpm, "transition": transition, "section": section}
(tracks if track else unmatched).append(entry) (tracks if track else unmatched).append(entry)
if not track: if not track:
......
"""Backlog setlist recovery — mechanical guards (parsers-over-copy).
The parser cleans noisy backlog lines and resolves to canonical .tidal via the
catalog alias index. Test the load-bearing bits (leet fold, label strip, transition
tails) on synthetic lines, plus a coverage regression guard on the real backlog."""
import backlog_setlists as B
def test_leet_fold():
    """leet_fold turns normalised leetspeak titles into their plain-letter form."""
    expected = {
        "Burn this B00K": "burnthisbook",
        "Ghosts in the T01l3ts": "ghostsinthetoilets",
        "So G00D": "sogood",
    }
    for title, folded in expected.items():
        assert B.leet_fold(B.norm_name(title)) == folded
def test_clean_strips_labels_and_keeps_track():
    """A leading set-position label is dropped; only the track name survives."""
    # label followed by markdown emphasis and emoji decoration
    cleaned = B.clean_track_line("- Intro _🚪 Contre-Visite 👁️_")
    assert cleaned[-1] and cleaned[0] == "Contre-Visite"
    # "Label: Name" with a trailing bpm tag
    cleaned = B.clean_track_line("- Finale: Lady Perplexity [138bpm]")
    assert cleaned[-1] and cleaned[0] == "Lady Perplexity"
def test_clean_drops_gloss_after_colon():
    """'Name: gloss' (lowercase gloss) keeps the name and drops the gloss."""
    name, *_, ok = B.clean_track_line("- Café tiède: au plus chaud")
    assert ok and name.lower().startswith("café tiède")
    # The prefix check alone would still pass if the gloss were merely left in
    # place after the name, so also require that the gloss text is gone.
    assert "chaud" not in name.lower()
def test_clean_captures_bpm_and_transition():
    """The [bpm] tag and the {transition} tag are both parsed out of a track line."""
    _name, tempo, tail, accepted = B.clean_track_line(
        "- Quand on Décolle [120] {11+12->11}")
    assert accepted
    assert tempo == 120
    assert tail == "11+12->11"
def test_clean_skips_ascii_and_comments():
    """ASCII-art decoration and HTML-commented lines are flagged as not-a-track."""
    for noise in ("⣿-----⣿", "<!-- - It's about Time -->"):
        *_, is_track = B.clean_track_line(noise)
        assert is_track is False
def test_real_backlog_coverage_does_not_regress():
    """Coverage guard run against the real backlog.

    The cleanly-recovered gigs must keep resolving: if a backlog or parser change
    makes these numbers drop, the failure surfaces it as a regression to look at
    instead of a silent loss.
    """
    report = B.build()
    every_gig = dict(report["gigs"])
    every_gig.update(report["pending"])
    assert report["n_gigs"] >= 8
    # opal-2024 is a clean, fully-resolvable block — it must stay at parity
    opal_2024 = every_gig["2024/opal-festival-2024"]
    assert opal_2024["n_resolved"] >= 13
    # no recovered gig should be empty
    for slug, gig in report["gigs"].items():
        assert gig["n_tracks"] >= 1, slug
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment