Commit 5acf72f7 by PLN (Algolia)

feat(backlog_setlists): recover gig setlists from backlog.md → resolved .tidal

The site tracks.json covers only 23/37 gigs; the rest live in backlog.md as
codename-keyed setlist blocks. New mechanical parser cleans the noisy lines
(ASCII-art, emoji, emphasis, inline [bpm]/{transition} notes, commented-out
tracks) and resolves names to canonical .tidal paths via the catalog's own
alias index (DRY). Gig-codename→slug mapping is authored (confirm:-flagged
when uncertain); suspect_bleed flags blocks that swallowed a neighbour list.
6 gigs cleanly recovered (52 track-slots); seam is larger (dozens of setlists).
parent 6815967f
#!/usr/bin/env python3
"""Recover gig setlists from backlog.md — the informal but canonical source PLN keeps.
The site `tracks.json` only covers 23/37 gigs; the rest live in `backlog.md` as
codename-keyed setlist blocks (e.g. `# VORTEX26`, `## Opal <3🦉`, `# Set @ENSAD 2025`).
Those blocks are noisy: ASCII-art banners, emoji, markdown emphasis, inline `[bpm]` and
`{transition-routing}` annotations, HTML-commented (dropped) tracks, section sub-headers.
This parser is mechanical (parsers-over-copy): it extracts each anchored block, cleans
the track lines, and RESOLVES names to canonical `.tidal` paths via the catalog's own
alias index (DRY — same names that feed the triangle). Gig-codename→site-slug mapping is
risky to guess, so it lives in an AUTHORED anchor table; uncertain anchors are flagged
`confirm:` and excluded from the emitted artifact until PLN confirms.
Emits `backlog_setlists.json`: per gig → resolved tracks (+ raw line, bpm, transition, section),
unmatched names surfaced honestly. Never invents a track that doesn't resolve.
"""
import json
import re
import sys
import unicodedata
from pathlib import Path
# Directory containing this script; the sibling JSON files below are resolved against it.
HERE = Path(__file__).parent
# Input: the informal setlist journal, located two directories above this script.
BACKLOG = HERE.parent.parent / "backlog.md"
# Input: catalog view JSON; build_name_index() reads its "tracks" entries ("names" → "track").
CATALOG_VIEW = HERE / "catalog_view.json"
# Output artifact written by main().
OUT = HERE / "backlog_setlists.json"
# Hard-coded date string emitted verbatim as the artifact's "as_of" field.
AS_OF = "2026-06-06"
# ── Authored gig anchors: (1-based line of the header in backlog.md) → site slug ──
# `confirm:`-prefixed slugs are NOT yet trusted (ambiguous header / needs PLN's ear);
# they are parsed and reported, but build() places them under "pending" instead of "gigs".
# NOTE(review): keys are absolute line numbers, so inserting or removing lines above an
# anchor in backlog.md presumably requires re-authoring this table — confirm the workflow.
ANCHORS = {
    1441: "2025/ensad", # "# Set @ENSAD 2025"
    1816: "2026/le-vortex", # "# VORTEX26"
    1031: "2024/opal-festival-2024", # "## Opal <3🦉" / OPAL 2024
    1632: "2025/opal-festival-2025", # "# OPAL 2025"
    710: "2023/mephisteuf", # "## Live @MephisTeuf"
    1194: "2024/38c3-house-of-tea", # "#### Day 2 - HOUSE OF Tea" (under 38C3)
    1242: "2024/38c3-toilet", # "#### Day 3 - Hardcore in the Toilet Club"
    1786: "2025/39c3-house-of-tea", # "# CCC 39c3 ALGORAVE ... Pour Un Thé Dansant"
    1169: "2024/toplap-solstice-2024", # "### TopLap Solstice Stream 2024"
    988: "confirm:2024/algolia-fdlm", # "## Algolia FDLM2024" — already has tracks.json
    1550: "confirm:2025/algorave-lyon", # "GZ 2025 LYON ALGORAVE" then "# LABENNE LIVE" (?)
    1831: "confirm:2026/ete-surprise", # "## SURPRISE IT'S A DANCEFLOOR" — slug guess
}
# Gigs still unanchored (no clear backlog block found): 2022/algolia-fdlm-2022,
# 2024/38c3-chaos-music-club, 2025/39c3-toilet-rave, 2025/toplap-fromscratch-2025.
# Lines that are decoration / structure, never tracks.
# _ASCII_ART: any single banner glyph (a braille-pattern range plus shade/block,
# geometric and box-drawing characters). One hit anywhere in a line disqualifies it.
_ASCII_ART = re.compile(r"[⠀-⣿▓▒░█▀▄■◆◢◣◤◥╔╗╚╝║═]")
# _SECTION_HINT: case-insensitive, anchored at the start of a cleaned name. parse() uses it
# to drop list items that look like section labels, unless the name resolves in the catalog.
_SECTION_HINT = re.compile(
    r"^(intro|outro|techno|dnb|nujazz|drill|drums|tempête|tempete|sommet|"
    r"surprise|la\s|le\s|les\s|day\s|samedi|mission|rappel|bonus|final|"
    r"pause|sunset|sunrise|darkness)\b", re.I)
def deaccent(s: str) -> str:
    """Return *s* without diacritics.

    The string is NFKD-decomposed and every character with a non-zero canonical
    combining class (the accent marks split off by the decomposition) is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def norm_name(s: str) -> str:
    """Build the matching key for a track name.

    Accents are removed with deaccent(), the result is lowercased, and every run
    of characters outside a-z / 0-9 is deleted, leaving an alphanumeric-only key.
    """
    folded = deaccent(s).lower()
    non_alnum_runs = re.compile(r"[^a-z0-9]+")
    return non_alnum_runs.sub("", folded)
def build_name_index():
    """Build the alias lookup used to resolve setlist names (DRY: catalog's own aliases).

    Reads CATALOG_VIEW as UTF-8 JSON and, for each entry of its "tracks" list, maps
    norm_name(alias) for every alias in the entry's "names" to the entry's "track" value.

    Returns:
        (idx, cv): the norm(name) → track mapping, and the parsed catalog view.
    """
    # Context manager closes the handle; explicit UTF-8 avoids the locale default.
    with open(CATALOG_VIEW, encoding="utf-8") as fh:
        cv = json.load(fh)
    idx = {}
    for t in cv["tracks"]:
        for nm in t["names"]:
            key = norm_name(nm)
            if not key:
                # A symbol-only alias normalizes to "" and would make every other
                # symbol-only setlist line "resolve" to this track — never index it.
                continue
            # First alias seen wins on collisions.
            idx.setdefault(key, t["track"])
    return idx, cv
def clean_track_line(raw: str):
    """Clean one raw backlog line into a track name plus its inline annotations.

    Args:
        raw: a single line from a setlist block (a trailing newline is tolerated).

    Returns:
        (name, bpm, transition, True) for a usable list item, where bpm is an int or
        None and transition is a string or None; (None, None, None, False) when the
        line must be skipped (HTML comment, banner art, not a list item, or a name
        shorter than two characters after cleaning).
    """
    s = raw.rstrip("\n")
    if "<!--" in s or "-->" in s:  # commented-out track → dropped
        return None, None, None, False
    if _ASCII_ART.search(s):  # banner art
        return None, None, None, False
    m = re.match(r"\s*[-+*]+\s*(.+)$", s)  # must be a list item
    if not m:
        return None, None, None, False
    body = m.group(1)
    body = re.sub(r"<+/?3+", " ", body)  # <3 / <33 hearts → space (else leak a stray digit)

    # Pull inline annotations before stripping punctuation.
    bpm = None
    bm = (re.search(r"\[?\b(\d{2,3})\s*bpm", body, re.I)
          or re.search(r"\[(\d{2,3})(?:[-.]\d{2,3})*\]", body))
    if bm:
        bpm = int(bm.group(1))
    transition = None
    om = re.search(r"\{([^}]+)\}", body)
    if om:
        transition = om.group(1).strip()

    # Strip annotations, emphasis, trailing gloss.
    body = re.sub(r"\{[^}]*\}", "", body)
    body = re.sub(r"\[[^\]]*\]", "", body)
    body = re.sub(r"\([^)]*\)", "", body)  # (Chloe cover) etc
    body = re.sub(r"\bfeat\b.*$", "", body, flags=re.I)
    body = body.replace("*", "").replace("_", "").replace("`", "")
    body = re.sub(r":\s*.*$", "", body)  # "Café tiède: note" → drop gloss after colon

    # Remove emoji / symbol chars, keep letters/digits/space/'-/&.
    # The characters in `tail_only` are meaningful inside a transition tail
    # ("-> 5+7", "1,2", en dash) so they must survive until the tail has been split
    # off; filtering them first turned "-> 5+7" into the wrong transition "57".
    tail_only = "+>,–"
    body = "".join(ch for ch in body
                   if ch.isalnum() or ch in " '-/&." or ch in tail_only
                   or unicodedata.category(ch).startswith("L"))

    # Dash/arrow transition tail (ENSAD style "Quand on décolle 0 - / 11").
    tt = _TRANS_TAIL.search(body)
    if tt and tt.group(0).strip():
        if not transition:  # an explicit {…} annotation takes precedence
            transition = tt.group(0).strip(" -–>")
        body = body[:tt.start()]

    # The tail-only characters never belong in the name itself.
    body = "".join(ch for ch in body if ch not in tail_only)
    body = re.sub(r"\s+", " ", body).strip(" .-")
    if len(body) < 2:
        return None, None, None, False
    return body, bpm, transition, True
def block_span(lines, start_idx):
    """Return the non-blank lines that make up the gig block anchored at *start_idx*.

    Headers in backlog.md are decorative rather than hierarchical, so the block is
    not scoped by header level. Scanning starts on the line after the anchor and is
    bounded by the next entry of ANCHORS (or the end of the file). It stops early at
    the third consecutive blank line once at least two list items have been seen.
    Header lines are returned along with everything else so the caller can use them
    as section context.
    """
    later_anchors = [k - 1 for k in ANCHORS if k - 1 > start_idx]
    stop = min(later_anchors) if later_anchors else len(lines)

    collected = []
    n_items = 0
    blank_run = 0
    for line in lines[start_idx + 1:stop]:
        if line.strip():
            blank_run = 0
            looks_like_item = re.match(r"\s*[-+*]+\s*\S", line) is not None
            if looks_like_item and _ASCII_ART.search(line) is None:
                n_items += 1
            collected.append(line)
        else:
            blank_run += 1
            if n_items >= 2 and blank_run >= 3:
                break
    return collected
# transition tails appended to a track name: "-> 5+7", " 0 - / 11", "- 9"
# Two alternatives, both anchored at the end of the string:
#   1. optional whitespace, an arrow `->` or a hyphen / en dash, then any run of
#      digits, whitespace and `. / + ,`
#   2. whitespace, one digit, then any run of digits, whitespace and `. / + , > -`
_TRANS_TAIL = re.compile(r"\s*(->|[-–])\s*[\d\s./+,]*$|\s+[\d][\d\s./+,>-]*$")
def parse(lines, name_idx):
    """Parse every anchored gig block of the backlog into resolved track entries.

    Args:
        lines: backlog.md split into lines (index 0 is line 1 of the file).
        name_idx: norm_name(alias) → track mapping, as built by build_name_index().

    Returns:
        dict keyed by site slug (without any "confirm:" prefix). Each value holds
        the slug, the confirm flag, the anchor header text and line, track counts,
        a "suspect_bleed" flag and the ordered "tracks" list. Names that do not
        resolve are kept in "tracks" with track=None rather than dropped.

    Raises:
        IndexError: if an ANCHORS line number does not exist in *lines*.
    """
    gigs = {}
    for ln1, slug in sorted(ANCHORS.items()):
        confirm = slug.startswith("confirm:")
        real_slug = slug.split(":", 1)[1] if confirm else slug
        i = ln1 - 1
        if not 0 <= i < len(lines):
            # Fail with context instead of a bare IndexError (or, for line 0,
            # silently reading the last line through negative indexing).
            raise IndexError(
                f"anchor line {ln1} ({slug}) is outside the backlog ({len(lines)} lines)")
        header = lines[i].strip()
        section = None
        tracks = []
        for raw in block_span(lines, i):
            hm = re.match(r"\s*#{1,6}\s+(.+)$", raw)
            if hm:  # subsection label
                section = re.sub(r"[^\w\s'-]", "", deaccent(hm.group(1))).strip() or None
                continue
            name, bpm, transition, ok = clean_track_line(raw)
            if not ok:
                continue
            key = norm_name(name)
            # A section-looking item is skipped unless the catalog knows it as a track.
            if _SECTION_HINT.match(name) and key not in name_idx:
                continue
            # Unresolved names are kept with track=None so they stay visible.
            tracks.append({"raw": name, "track": name_idx.get(key), "bpm": bpm,
                           "transition": transition, "section": section})
        gigs[real_slug] = {
            "slug": real_slug, "confirm": confirm, "header": header, "anchor_line": ln1,
            "n_tracks": len(tracks),
            "n_resolved": sum(1 for t in tracks if t["track"]),
            # >20 items between anchors → almost certainly swallowed a neighbour setlist
            # (backlog has many densely-packed lists; needs a tighter anchor). Flag, don't hide.
            "suspect_bleed": len(tracks) > 20,
            "tracks": tracks,
        }
    return gigs
def build():
    """Assemble the artifact dict: parse the backlog and split gigs by trust level.

    Returns:
        dict with schema/as_of/source metadata, "gigs" (anchors without the
        "confirm:" flag), "pending" (flagged anchors) and the two matching counts.
    """
    name_idx, _ = build_name_index()
    # backlog.md carries accents, emoji and banner glyphs; decode it as UTF-8
    # explicitly instead of relying on the platform's locale encoding.
    lines = BACKLOG.read_text(encoding="utf-8").splitlines()
    gigs = parse(lines, name_idx)
    confirmed = {k: v for k, v in gigs.items() if not v["confirm"]}
    pending = {k: v for k, v in gigs.items() if v["confirm"]}
    return {
        "schema": "backlog setlists (recovered gig→tracks; resolved to .tidal)",
        "as_of": AS_OF, "source": "backlog.md",
        "n_gigs": len(confirmed), "n_pending_confirm": len(pending),
        "gigs": confirmed, "pending": pending,
    }
def main():
    """Build the artifact, write it to OUT as JSON, and print a per-gig report."""
    out = build()
    # ensure_ascii=False writes accents/emoji as raw characters, so the file encoding
    # is pinned to UTF-8; the locale default could raise or yield non-UTF-8 JSON.
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=1), encoding="utf-8")
    print(f"✓ {OUT}")
    print(f" {out['n_gigs']} gigs recovered, {out['n_pending_confirm']} pending confirm\n")
    for grp, label in ((out["gigs"], "RECOVERED"), (out["pending"], "PENDING-CONFIRM")):
        for slug, g in sorted(grp.items()):
            print(f" [{label}] {slug} «{g['header'][:48]}» "
                  f"{g['n_resolved']}/{g['n_tracks']} resolved")
            for t in g["tracks"]:
                mark = "✓" if t["track"] else "✗"
                extra = " ".join(x for x in (f"{t['bpm']}bpm" if t['bpm'] else "",
                                             f"{{{t['transition']}}}" if t['transition'] else "") if x)
                tgt = Path(t["track"]).stem if t["track"] else "— UNMATCHED —"
                print(f" {mark} {t['raw']:<32} → {tgt} {extra}")
            print()  # blank line between gigs
# Run the recovery only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment