feat(backlog_setlists): recover gig setlists from backlog.md → resolved .tidal

The site's tracks.json covers only 23 of 37 gigs; the rest live in backlog.md as codename-keyed setlist blocks. A new mechanical parser cleans the noisy lines (ASCII art, emoji, emphasis, inline [bpm]/{transition} notes, commented-out tracks) and resolves track names to canonical .tidal paths via the catalog's own alias index (DRY). The gig-codename→slug mapping is authored by hand (flagged `confirm:` when uncertain); suspect_bleed flags blocks that swallowed a neighbouring list. 6 gigs are cleanly recovered (52 track slots); the seam is larger (dozens of setlists).

feat(backlog_setlists): recover gig setlists from backlog.md → resolved .tidal
The site's tracks.json covers only 23 of 37 gigs; the rest live in backlog.md as codename-keyed setlist blocks. A new mechanical parser cleans the noisy lines (ASCII art, emoji, emphasis, inline [bpm]/{transition} notes, commented-out tracks) and resolves track names to canonical .tidal paths via the catalog's own alias index (DRY). The gig-codename→slug mapping is authored by hand (flagged `confirm:` when uncertain); suspect_bleed flags blocks that swallowed a neighbouring list. 6 gigs are cleanly recovered (52 track slots); the seam is larger (dozens of setlists).
5acf72f7 · PLN (Algolia) · 6815967f · 5acf72f7 · 5acf72f7
Commit 5acf72f7 authored Jun 06, 2026 by PLN (Algolia)
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 216 additions and 0 deletions

backlog_setlists.json armada/tide-table/backlog_setlists.json +0 -0

backlog_setlists.py armada/tide-table/backlog_setlists.py +216 -0

No files found.
--- a/armada/tide-table/backlog_setlists.json
+++ b/armada/tide-table/backlog_setlists.json
--- a/armada/tide-table/backlog_setlists.py
+++ b/armada/tide-table/backlog_setlists.py
+#!/usr/bin/env python3
+"""Recover gig setlists from backlog.md — the informal but canonical source PLN keeps.
+
+The site `tracks.json` only covers 23/37 gigs; the rest live in `backlog.md` as
+codename-keyed setlist blocks (e.g. `# VORTEX26`, `## Opal <3🦉`, `# Set @ENSAD 2025`).
+Those blocks are noisy: ASCII-art banners, emoji, markdown emphasis, inline `[bpm]` and
+`{transition-routing}` annotations, HTML-commented (dropped) tracks, section sub-headers.
+
+This parser is mechanical (parsers-over-copy): it extracts each anchored block, cleans
+the track lines, and RESOLVES names to canonical `.tidal` paths via the catalog's own
+alias index (DRY — same names that feed the triangle). Gig-codename→site-slug mapping is
+risky to guess, so it lives in an AUTHORED anchor table; uncertain anchors are flagged
+`confirm:` and excluded from the emitted artifact until PLN confirms.
+
+Emits `backlog_setlists.json`: per gig → resolved tracks (+ raw line, bpm, transition, section),
+unmatched names surfaced honestly. Never invents a track that doesn't resolve.
+"""
+import json
+import re
+import sys
+import unicodedata
+from pathlib import Path
+
+# All locations are derived from this script's own path.
+HERE = Path(__file__).parent
+# backlog.md sits two directories above this script's folder.
+BACKLOG = HERE.parent.parent / "backlog.md"
+# Catalog document read by build_name_index() (alias names → track).
+CATALOG_VIEW = HERE / "catalog_view.json"
+# Artifact written by main().
+OUT = HERE / "backlog_setlists.json"
+# Emitted verbatim as the artifact's "as_of" field.
+AS_OF = "2026-06-06"
+
+# ── Authored gig anchors: (1-based line of the header in backlog.md) → site slug ──
+# `confirm:`-prefixed slugs are NOT yet trusted (ambiguous header / needs PLN's ear);
+# they are parsed and reported but kept out of the artifact's "gigs" map
+# (build() emits them separately under "pending").
+# Anchors are positional: parse() reads line (key - 1) of backlog.md as the header and
+# block_span() stops a block at the next anchor, so the keys must be re-checked whenever
+# backlog.md is edited above them.
+ANCHORS = {
+    1441: "2025/ensad",                    # "# Set @ENSAD 2025"
+    1816: "2026/le-vortex",                # "# VORTEX26"
+    1031: "2024/opal-festival-2024",       # "## Opal <3🦉" / OPAL 2024
+    1632: "2025/opal-festival-2025",       # "# OPAL 2025"
+    710:  "2023/mephisteuf",               # "## Live @MephisTeuf"
+    1194: "2024/38c3-house-of-tea",        # "#### Day 2 - HOUSE OF Tea" (under 38C3)
+    1242: "2024/38c3-toilet",              # "#### Day 3 - Hardcore in the Toilet Club"
+    1786: "2025/39c3-house-of-tea",        # "# CCC 39c3 ALGORAVE ... Pour Un Thé Dansant"
+    1169: "2024/toplap-solstice-2024",     # "### TopLap Solstice Stream 2024"
+    988:  "confirm:2024/algolia-fdlm",     # "## Algolia FDLM2024" — already has tracks.json
+    1550: "confirm:2025/algorave-lyon",    # "GZ 2025 LYON ALGORAVE" then "# LABENNE LIVE" (?)
+    1831: "confirm:2026/ete-surprise",     # "## SURPRISE IT'S A DANCEFLOOR" — slug guess
+}
+# Gigs still unanchored (no clear backlog block found): 2022/algolia-fdlm-2022,
+# 2024/38c3-chaos-music-club, 2025/39c3-toilet-rave, 2025/toplap-fromscratch-2025.
+
+# Lines that are decoration / structure, never tracks.
+# _ASCII_ART: any single banner glyph (the ⠀-⣿ range plus block and box-drawing characters)
+# marks the whole line as art; used by clean_track_line() and block_span().
+_ASCII_ART = re.compile(r"[⠀-⣿▓▒░█▀▄■◆◢◣◤◥╔╗╚╝║═]")
+# _SECTION_HINT: case-insensitive leading words that usually label a set section rather than
+# a track. parse() skips a matching item only when its name does not resolve in the catalog.
+_SECTION_HINT = re.compile(
+    r"^(intro|outro|techno|dnb|nujazz|drill|drums|tempête|tempete|sommet|"
+    r"surprise|la\s|le\s|les\s|day\s|samedi|mission|rappel|bonus|final|"
+    r"pause|sunset|sunrise|darkness)\b", re.I)
+
+
+def deaccent(s: str) -> str:
+    """Strip diacritics: NFKD-decompose *s*, then drop every combining mark."""
+    decomposed = unicodedata.normalize("NFKD", s)
+    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
+    return "".join(kept)
+
+
+def norm_name(s: str) -> str:
+    """Build the matching key for a track name.
+
+    The name is de-accented and lowercased, then everything outside ``a-z0-9`` is
+    discarded, so spelling variants of one title collapse to the same key.
+    """
+    folded = deaccent(s).lower()
+    return "".join(re.findall(r"[a-z0-9]+", folded))
+
+
+def build_name_index():
+    """Build the name → track lookup from the catalog's existing alias set (DRY).
+
+    Reads ``CATALOG_VIEW`` as UTF-8 JSON. The document must have a top-level ``"tracks"``
+    list whose items carry ``"names"`` (iterable of aliases) and ``"track"`` (the value
+    stored in the index).
+
+    Returns:
+        ``(idx, cv)`` where ``idx`` maps ``norm_name(alias)`` to the item's ``"track"``
+        value and ``cv`` is the parsed catalog document.
+    """
+    # Context manager closes the handle; explicit UTF-8 avoids the locale default
+    # mis-decoding accented aliases.
+    with open(CATALOG_VIEW, encoding="utf-8") as fh:
+        cv = json.load(fh)
+    idx = {}
+    for t in cv["tracks"]:
+        for nm in t["names"]:
+            # setdefault: on a normalized-name collision the first alias seen wins
+            idx.setdefault(norm_name(nm), t["track"])
+    return idx, cv
+
+
+def clean_track_line(raw: str):
+    """Clean one raw backlog line into a track candidate.
+
+    Returns ``(name, bpm, transition, ok)``. When ``ok`` is False the other three are
+    None and the line must be skipped: it contains an HTML comment marker, contains
+    banner art, is not a ``-``/``+``/``*`` list item, or fewer than two characters of
+    name survive cleaning.
+
+    * ``bpm`` — the first 2–3 digit number followed by ``bpm``, otherwise the first
+      number of a bracketed note such as ``[120]`` / ``[120-140]``; None when absent.
+    * ``transition`` — the ``{...}`` annotation when present, otherwise a trailing
+      dash/arrow/number tail such as ``-> 5+7`` (operators preserved); None when absent
+      or when the tail is a bare dash.
+    """
+    s = raw.rstrip("\n")
+    if "<!--" in s or "-->" in s:           # commented-out track → dropped
+        return None, None, None, False
+    if _ASCII_ART.search(s):                # banner art
+        return None, None, None, False
+    m = re.match(r"\s*[-+*]+\s*(.+)$", s)   # must be a list item
+    if not m:
+        return None, None, None, False
+    body = m.group(1)
+    body = re.sub(r"<+/?3+", " ", body)      # <3 / <33 hearts → space (else leak a stray digit)
+    # pull inline annotations before stripping punctuation
+    bpm = None
+    bm = (re.search(r"\[?\b(\d{2,3})\s*bpm", body, re.I)
+          or re.search(r"\[(\d{2,3})(?:[-.]\d{2,3})*\]", body))
+    if bm:
+        bpm = int(bm.group(1))
+    transition = None
+    om = re.search(r"\{([^}]+)\}", body)
+    if om:
+        transition = om.group(1).strip()
+    # strip annotations, emphasis, trailing gloss
+    body = re.sub(r"\{[^}]*\}", "", body)
+    body = re.sub(r"\[[^\]]*\]", "", body)
+    body = re.sub(r"\([^)]*\)", "", body)            # (Chloe cover) etc
+    body = re.sub(r"\bfeat\b.*$", "", body, flags=re.I)
+    body = body.replace("*", "").replace("_", "").replace("`", "")
+    body = re.sub(r":\s*.*$", "", body)              # "Café tiède: note" → drop gloss after colon
+    # Characters allowed in the final name: letters/digits plus space and '-/&.
+    name_keep = " '-/&."
+    # First pass drops emoji/symbols but KEEPS the tail operators (> + , –): filtering them
+    # before the tail match would turn "-> 5+7" into "- 57" and corrupt the transition.
+    loose = "".join(ch for ch in body if ch.isalnum() or ch in name_keep + ">+,–")
+    # dash/arrow transition tail (ENSAD style "Quand on décolle 0 - / 11").
+    # The en dash is mapped 1:1 to "-" for matching only, so offsets still index `loose`.
+    tt = _TRANS_TAIL.search(loose.replace("–", "-"))
+    if tt and tt.group(0).strip():
+        if not transition:
+            # a bare dash/arrow leaves nothing after stripping → None, not ""
+            transition = tt.group(0).strip(" ->") or None
+        loose = loose[:tt.start()]
+    # Second pass: the name itself never carries the tail operators.
+    body = "".join(ch for ch in loose if ch.isalnum() or ch in name_keep)
+    body = re.sub(r"\s+", " ", body).strip(" .-")
+    if len(body) < 2:
+        return None, None, None, False
+    return body, bpm, transition, True
+
+
+def block_span(lines, start_idx):
+    """Return the non-blank lines of the gig block that starts after ``lines[start_idx]``.
+
+    Headers in backlog.md are decorative (stylized banners, repeated `# PARVAGUES`,
+    sub-labels), not hierarchical, so the block is not scoped by header level. The scan
+    runs from the line after the anchor up to the next entry of ANCHORS (or end of file)
+    and stops early at the third consecutive blank line once at least two list items
+    have been seen (the setlist is over, journal prose resumes). Header lines are
+    returned like any other line so the caller can use them as section context.
+    """
+    later_anchors = [k - 1 for k in sorted(ANCHORS) if k - 1 > start_idx]
+    hard_end = later_anchors[0] if later_anchors else len(lines)
+    kept = []
+    n_items = 0
+    blank_run = 0
+    for raw in lines[start_idx + 1:hard_end]:
+        if raw.strip():
+            blank_run = 0
+            # list items carrying banner art do not count towards the setlist
+            looks_like_item = re.match(r"\s*[-+*]+\s*\S", raw) is not None
+            if looks_like_item and not _ASCII_ART.search(raw):
+                n_items += 1
+            kept.append(raw)
+            continue
+        blank_run += 1
+        if n_items >= 2 and blank_run >= 3:
+            break
+    return kept
+
+
+# transition tails appended to a track name: "-> 5+7", " 0 - / 11", "- 9"
+# Two end-anchored alternatives:
+#   1. optional whitespace, an arrow or dash, then only digits/whitespace/./+, up to end of line;
+#   2. whitespace, a digit, then only digits/whitespace/./+,>- up to end of line.
+# Note that alternative 2 also matches a plain trailing number ("Name 9").
+_TRANS_TAIL = re.compile(r"\s*(->|[-–])\s*[\d\s./+,]*$|\s+[\d][\d\s./+,>-]*$")
+
+
+def parse(lines, name_idx):
+    """Parse every anchored gig block of the backlog into resolved track entries.
+
+    Args:
+        lines: backlog.md split into lines (index 0 is line 1).
+        name_idx: mapping ``norm_name(alias)`` → track, as built by build_name_index().
+
+    Returns:
+        dict keyed by slug (``confirm:`` prefix removed). Each value holds ``slug``,
+        ``confirm``, ``header``, ``anchor_line``, ``n_tracks``, ``n_resolved``,
+        ``suspect_bleed`` and ``tracks``. Every track entry has ``raw`` (the cleaned name),
+        ``track`` (resolved value, or None when the name is unmatched), ``bpm``,
+        ``transition`` and ``section``. Unmatched names stay in ``tracks`` in setlist order.
+
+    Raises:
+        IndexError: an anchor line number lies outside ``lines`` (backlog.md has drifted).
+    """
+    gigs = {}
+    for ln1, slug in sorted(ANCHORS.items()):
+        confirm = slug.startswith("confirm:")
+        real_slug = slug.split(":", 1)[1] if confirm else slug
+        i = ln1 - 1
+        if not 0 <= i < len(lines):
+            # explicit check: a non-positive anchor would otherwise silently index from the end
+            raise IndexError(
+                f"anchor line {ln1} ({slug}) is outside backlog.md ({len(lines)} lines)")
+        header = lines[i].strip()
+        section = None
+        tracks = []
+        for raw in block_span(lines, i):
+            hm = re.match(r"\s*#{1,6}\s+(.+)$", raw)
+            if hm:                                          # subsection label
+                section = re.sub(r"[^\w\s'-]", "", deaccent(hm.group(1))).strip() or None
+                continue
+            name, bpm, transition, ok = clean_track_line(raw)
+            if not ok:
+                continue
+            key = norm_name(name)
+            # a section-looking item is dropped only if it is not a real catalog name
+            if _SECTION_HINT.match(name) and key not in name_idx:
+                continue
+            # unresolved names are kept with track=None — never invented, never hidden
+            tracks.append({"raw": name, "track": name_idx.get(key), "bpm": bpm,
+                           "transition": transition, "section": section})
+        gigs[real_slug] = {
+            "slug": real_slug, "confirm": confirm, "header": header, "anchor_line": ln1,
+            "n_tracks": len(tracks),
+            "n_resolved": sum(1 for t in tracks if t["track"]),
+            # >20 items between anchors → almost certainly swallowed a neighbour setlist
+            # (backlog has many densely-packed lists; needs a tighter anchor). Flag, don't hide.
+            "suspect_bleed": len(tracks) > 20,
+            "tracks": tracks,
+        }
+    return gigs
+
+
+def build():
+    """Assemble the artifact document from backlog.md and the catalog.
+
+    Reads ``BACKLOG`` as UTF-8, parses every anchored gig and splits the result by trust:
+    gigs whose anchor is not ``confirm:``-flagged go under ``"gigs"``, flagged ones under
+    ``"pending"``.
+
+    Returns:
+        dict with ``schema``, ``as_of``, ``source``, ``n_gigs``, ``n_pending_confirm``,
+        ``gigs`` and ``pending``.
+    """
+    name_idx, _ = build_name_index()
+    # backlog.md carries emoji and accents: decode explicitly instead of via the locale default
+    lines = BACKLOG.read_text(encoding="utf-8").splitlines()
+    gigs = parse(lines, name_idx)
+    confirmed = {k: v for k, v in gigs.items() if not v["confirm"]}
+    pending = {k: v for k, v in gigs.items() if v["confirm"]}
+    return {
+        "schema": "backlog setlists (recovered gig→tracks; resolved to .tidal)",
+        "as_of": AS_OF, "source": "backlog.md",
+        "n_gigs": len(confirmed), "n_pending_confirm": len(pending),
+        "gigs": confirmed, "pending": pending,
+    }
+
+
+def main():
+    """Build the artifact, write it to ``OUT`` as UTF-8 JSON and print a per-gig report.
+
+    The report lists recovered gigs first, then pending-confirm ones, each with its
+    resolved/total count and one line per track (✓ resolved, ✗ unmatched).
+    """
+    out = build()
+    # ensure_ascii=False writes emoji/accents verbatim, so the file encoding must be
+    # explicit — the locale default can raise UnicodeEncodeError on non-UTF-8 systems.
+    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=1), encoding="utf-8")
+    print(f"✓ {OUT}")
+    print(f"  {out['n_gigs']} gigs recovered, {out['n_pending_confirm']} pending confirm\n")
+    for grp, label in ((out["gigs"], "RECOVERED"), (out["pending"], "PENDING-CONFIRM")):
+        for slug, g in sorted(grp.items()):
+            print(f"  [{label}] {slug}  «{g['header'][:48]}»  "
+                  f"{g['n_resolved']}/{g['n_tracks']} resolved")
+            for t in g["tracks"]:
+                mark = "✓" if t["track"] else "✗"
+                extra = " ".join(x for x in (f"{t['bpm']}bpm" if t['bpm'] else "",
+                                             f"{{{t['transition']}}}" if t['transition'] else "") if x)
+                tgt = Path(t["track"]).stem if t["track"] else "— UNMATCHED —"
+                print(f"      {mark} {t['raw']:<32} → {tgt}  {extra}")
+            print()
+
+
+if __name__ == "__main__":
+    main()