feat(semantics): validated CLAP vibe-search + live /vibe endpoint (#82,#86)

Katana-first finding: per-one-shot CLAP genre/mood tags are unreliable (every hit → boom-bap/euphoric — a 0.3s sound has no genre), but the audio EMBEDDINGS are gold for RELATIVE similarity. 'warm dusty rhodes' → suns_keys gold-keys + west-coast electric; 'jazzy upright bass' → no_sunshine/come_bass loops; a kick's nearest neighbours are other kicks (0.96 cross-folder). So we ship similarity, not fake absolute labels (Principle 1: trust the instrument).

- sample_semantics.py validated on real audio; semantics_embeds.npz = 1490×512-d.
- serve.py: lazy CLAP /vibe?q= (embed any phrase → rank) + /similar?name= (by audio-embed cosine). 503 if unbuilt, 400/404 on bad input; static serving untouched. Single-user LAN, torch loads once on first hit.

feat(semantics): validated CLAP vibe-search + live /vibe endpoint (#82,#86)
Katana-first finding: per-one-shot CLAP genre/mood tags are unreliable (every hit → boom-bap/euphoric — a 0.3s sound has no genre), but the audio EMBEDDINGS are gold for RELATIVE similarity. 'warm dusty rhodes' → suns_keys gold-keys + west-coast electric; 'jazzy upright bass' → no_sunshine/come_bass loops; a kick's nearest neighbours are other kicks (0.96 cross-folder). So we ship similarity, not fake absolute labels (Principle 1: trust the instrument).

- sample_semantics.py validated on real audio; semantics_embeds.npz = 1490×512-d.
- serve.py: lazy CLAP /vibe?q= (embed any phrase → rank) + /similar?name= (by audio-embed cosine). 503 if unbuilt, 400/404 on bad input; static serving untouched. Single-user LAN, torch loads once on first hit.
ac4643d0 · PLN (Algolia) · 7981dd07 · ac4643d0 · ac4643d0 · ac4643d0
Commit ac4643d0 authored Jun 07, 2026 by PLN (Algolia)
Hide whitespace changes
Inline Side-by-side

Showing with 301 additions and 1 deletion

serve.py armada/serve.py +81 -1

sample_semantics.py armada/tide-table/sample_semantics.py +220 -0

semantics_embeds.npz armada/tide-table/semantics_embeds.npz +0 -0

No files found.
--- a/armada/serve.py
+++ b/armada/serve.py
@@ -7,12 +7,92 @@ the printed LAN URL on a phone on the same wifi.

  python3 serve.py --dir tide-table/punkachien --port 8731
 """
-import argparse, os, re, socket
+import argparse, json, os, re, socket, sys, threading
 from functools import partial
 from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+from urllib.parse import urlparse, parse_qs
+
+# ── optional CLAP vibe-search API (lazy: torch loads on first /vibe hit) ───────
+# Serves /vibe?q=<phrase> and /similar?name=folder/stem over the cached sample
+# embeddings (semantics_embeds.npz in --dir). Absent/unbuilt → 503 with a hint,
+# never breaks static serving. See sample_semantics.py.
+_VIBE = {}                      # shared vibe-search state; filled once by _vibe_load(), which sets "ready" last
+_VIBE_LOCK = threading.Lock()   # held by _vibe_load() while it performs the one-time load
+
+
+def _vibe_load():
+    """Lazy-load CLAP + the cached embed table once. Returns state or raises."""
+    if "ready" in _VIBE:
+        return _VIBE
+    with _VIBE_LOCK:
+        if "ready" in _VIBE:
+            return _VIBE
+        sys.path.insert(0, os.getcwd())            # --dir (tide-table) for imports
+        import numpy as np
+        import sample_semantics as S
+        if not S.EMBEDS.exists():
+            raise FileNotFoundError(f"{S.EMBEDS.name} not built — run "
+                                    "`python3 sample_semantics.py embed`")
+        z = np.load(S.EMBEDS, allow_pickle=True)
+        _VIBE.update(S=S, np=np, M=z["embeds"].astype("float32"),
+                     names=[str(x) for x in z["names"]],
+                     idx={str(n): i for i, n in enumerate(z["names"])})
+        _VIBE["ready"] = True
+    return _VIBE
+
+
+def _vibe_rank(qvec, n, drop=-1):
+    V = _VIBE
+    sims = V["M"] @ qvec
+    order = V["np"].argsort(-sims)
+    out = []
+    for i in order:
+        if i == drop:
+            continue
+        out.append({"name": V["names"][i], "sim": round(float(sims[i]), 4)})
+        if len(out) >= n:
+            break
+    return out


 class RangeHandler(SimpleHTTPRequestHandler):
+    def do_GET(self):
+        if self.path.split("?", 1)[0] in ("/vibe", "/similar"):
+            return self._api()
+        return super().do_GET()
+
+    def _api(self):
+        u = urlparse(self.path)
+        q = parse_qs(u.query)
+        n = int(q.get("n", ["24"])[0])
+        try:
+            V = _vibe_load()
+        except Exception as e:
+            return self._json({"error": str(e)}, 503)
+        try:
+            if u.path == "/vibe":
+                phrase = (q.get("q", [""])[0]).strip()
+                if not phrase:
+                    return self._json({"error": "empty query"}, 400)
+                qe = V["S"]._embed_texts([phrase]).cpu().numpy()[0]
+                return self._json({"query": phrase, "results": _vibe_rank(qe, n)})
+            else:  # /similar
+                name = q.get("name", [""])[0]
+                if name not in V["idx"]:
+                    return self._json({"error": f"unknown sample {name!r}"}, 404)
+                i = V["idx"][name]
+                return self._json({"of": name, "results": _vibe_rank(V["M"][i], n, drop=i)})
+        except Exception as e:
+            return self._json({"error": repr(e)}, 500)
+
+    def _json(self, obj, code=200):
+        body = json.dumps(obj).encode()
+        self.send_response(code)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
    def end_headers(self):
        self.send_header("Accept-Ranges", "bytes")
        self.send_header("Cache-Control", "no-cache")

--- a/armada/tide-table/sample_semantics.py
+++ b/armada/tide-table/sample_semantics.py
+#!/usr/bin/env python3
+"""sample_semantics — the L2 'tie-the-knot' engine: open-vocabulary tags + vibe search.
+
+When PLN reaches for a sound it's never "give me family=keys" — it's "a LUSH keyboard",
+"some VOICE CHOIRS", "a SOMBRE ténébreux synth", "BRASS long samples", "let's go NUJAZZ",
+"californian G-FUNK". Those words live on several independent axes at once (instrument ×
+timbre × genre-scene × mood), and CLAP is the bridge: every phrase is BOTH a label to
+score a sample against AND a query to search the library with — same embedding space,
+both directions.
+
+This module scores each sample on a rich MULTI-AXIS descriptor ontology (one softmax per
+axis → top tags + confidence per axis), and runs free-text vibe search ("find me a
+warm dusty rhodes") by embedding the query and ranking samples by cosine similarity. The
+ontology is meant to grow — add the words you actually use; FR + EN both welcome. CLAP
+proposes, PLN's ear confirms ([[reference_audio_feature_stack]], [[feedback_sample_identity]]).
+
+    python3 sample_semantics.py tags <folder|file>          # multi-axis tags for a sound
+    python3 sample_semantics.py search "lush warm rhodes" [--limit N]
+    python3 sample_semantics.py embed [--all] [--limit N]   # cache audio embeds → semantics_embeds.npz
+
+Reuses the validated CLAP primitives from sample_classify (text tower encoded once,
+audio via get_audio_features().pooler_output, normalized — identical to a full forward).
+"""
+import json
+import sys
+from pathlib import Path
+
+import numpy as np
+
+import sample_classify as CLF
+
+HERE = Path(__file__).resolve().parent      # directory that contains this module
+EMBEDS = HERE / "semantics_embeds.npz"      # cached per-file audio embeddings (vibe search)
+OUT = HERE / "sample_semantics.json"        # not referenced in the code shown here — presumably a tags output path; TODO confirm
+
+# ── the multi-axis descriptor ontology ────────────────────────────────────────
+# Each axis is scored INDEPENDENTLY (own softmax) — a sound is e.g. instrument=rhodes,
+# timbre=warm+lush, scene=nu-jazz, mood=dreamy all at once. Phrases drop into the
+# "this is the sound of {phrase}" frame CLAP was trained on. PLN's own words are folded
+# in verbatim where they're vivid (lush, ténébreux, g-funk).
+# Maps axis name → list of descriptor phrases. _embed_texts() prefixes each phrase
+# with "this is the sound of " before encoding, so entries are written to read
+# naturally after that prefix.
+AXES = {
+    # what is sounding: drums, basses, keys, strings, winds, voices…
+    "instrument": [
+        "a kick drum", "a snare drum", "a hi-hat", "hand percussion", "a drum break",
+        "a sub bass", "an acid bassline", "a grand piano", "a rhodes electric piano",
+        "a hammond organ", "an electric guitar", "a synth lead", "a synth pad",
+        "a string ensemble", "a brass section", "a saxophone", "a flute",
+        "a choir of voices", "a solo singing voice", "a spoken voice", "a bell or mallet",
+        "a plucked string instrument", "a sitar or eastern string instrument",
+    ],
+    # tone-colour adjectives (warm, dark, gritty, glassy…)
+    "timbre": [
+        "a warm sound", "a dark sound", "a bright sparkling sound", "a lush rich sound",
+        "a gritty distorted sound", "a smooth clean sound", "a harsh aggressive sound",
+        "a dusty lo-fi sound", "a glassy crystalline sound", "a woody organic sound",
+        "a metallic sound", "an airy breathy sound", "a fat thick sound",
+        "a thin sound", "a shadowy ténébreux sound", "a dreamy washed-out sound",
+        "a punchy tight sound", "a muddy boomy sound",
+    ],
+    # genre / scene names
+    "scene": [
+        "nu jazz", "jazz", "californian g-funk", "boom bap hip hop", "trap",
+        "deep house", "techno", "acid", "jungle drum and bass", "dub reggae",
+        "ambient", "disco funk", "afrobeat", "indian classical raga", "latin",
+        "cinematic orchestral", "soul and motown", "chiptune", "garage",
+    ],
+    # emotional character, phrased as adjective pairs
+    "mood": [
+        "sombre and melancholic", "euphoric and uplifting", "dreamy and floating",
+        "aggressive and intense", "playful and quirky", "tense and suspenseful",
+        "hypnotic and driving", "warm and nostalgic", "cold and clinical",
+        "epic and triumphant", "intimate and tender", "dark and brooding",
+    ],
+}
+
+
+# ── CLAP primitives (own text embeds per axis; shares the model load) ──────────
+_S = {}    # module-level cache; _axis_embeds() stores its per-axis table under the "axes" key
+
+
+def _embed_texts(prompts):
+    """Normalized CLAP text embeddings for a list of phrases (uses sample_classify's
+    loaded model; .pooler_output is the projected joint embedding — verified)."""
+    C = CLF._clap()                                       # ensures model is loaded
+    torch = C["torch"]
+    proc, model = C["proc"], C["model"]
+    ti = proc(text=["this is the sound of " + p for p in prompts],
+              return_tensors="pt", padding=True)
+    with torch.no_grad():
+        te = model.get_text_features(input_ids=ti["input_ids"],
+                                     attention_mask=ti["attention_mask"]).pooler_output
+        te = te / te.norm(p=2, dim=-1, keepdim=True)
+    return te
+
+
+def _axis_embeds():
+    """Cache: {axis: (labels, text_embed_tensor)} for every axis."""
+    if "axes" not in _S:
+        _S["axes"] = {ax: (labels, _embed_texts(labels)) for ax, labels in AXES.items()}
+    return _S["axes"]
+
+
+def embed_audios(paths):
+    """Normalized CLAP audio embeddings for paths → (np.ndarray (n×512), kept_paths).
+    Decodes via sample_classify.load_audio (48k mono, thread-pooled)."""
+    from concurrent.futures import ThreadPoolExecutor
+    with ThreadPoolExecutor(max_workers=4) as ex:
+        decoded = list(ex.map(CLF.load_audio, paths))
+    keep = [(p, a) for p, a in zip(paths, decoded) if a is not None]
+    if not keep:
+        return np.zeros((0, 512), dtype=np.float32), []
+    C = CLF._clap()
+    torch = C["torch"]
+    with torch.no_grad():
+        ai = C["proc"](audio=[a for _, a in keep], sampling_rate=CLF.SR,
+                       return_tensors="pt", padding=True)
+        ae = C["model"].get_audio_features(input_features=ai["input_features"]).pooler_output
+        ae = ae / ae.norm(p=2, dim=-1, keepdim=True)
+    return ae.cpu().numpy(), [p for p, _ in keep]
+
+
+# ── tagging: multi-axis labels for one sound ──────────────────────────────────
+def tag_audio_embed(ae_row, topk=3):
+    """One normalized audio-embed row → {axis: [(label, prob)...topk]}."""
+    C = CLF._clap()
+    torch = C["torch"]
+    ten = torch.tensor(ae_row).unsqueeze(0)
+    scale = C["logit_scale"]
+    out = {}
+    for ax, (labels, te) in _axis_embeds().items():
+        logits = (scale * ten @ te.t()).squeeze(0)
+        probs = torch.softmax(logits, dim=-1)
+        top = sorted(zip(labels, probs.tolist()), key=lambda t: -t[1])[:topk]
+        out[ax] = [(lab, round(p, 3)) for lab, p in top]
+    return out
+
+
+def tag_folder(name, topk=3):
+    files = CLF.folder_files(name)
+    if not files:
+        return None
+    ae, kept = embed_audios(files[:CLF.MAX_FILES])
+    if not len(ae):
+        return None
+    mean = ae.mean(0, keepdims=True)
+    mean = mean / (np.linalg.norm(mean) + 1e-9)
+    return tag_audio_embed(mean[0], topk=topk)
+
+
+# ── vibe search: free-text query → ranked samples ─────────────────────────────
+def vibe_search(query, topn=15):
+    if not EMBEDS.exists():
+        sys.exit(f"no {EMBEDS.name} — run `python3 sample_semantics.py embed --all` first.")
+    z = np.load(EMBEDS, allow_pickle=True)
+    M, names = z["embeds"], z["names"]
+    qe = _embed_texts([query]).cpu().numpy()[0]
+    sims = M @ qe
+    order = np.argsort(-sims)[:topn]
+    return [(str(names[i]), round(float(sims[i]), 3)) for i in order]
+
+
+# ── commands ──────────────────────────────────────────────────────────────────
+def cmd_tags(target):
+    p = Path(target)
+    if p.is_file():
+        ae, _ = embed_audios([p])
+        if not len(ae):
+            sys.exit("decode failed")
+        tags = tag_audio_embed(ae[0])
+    else:
+        tags = tag_folder(target)
+        if not tags:
+            sys.exit(f"no folder/files for {target!r}")
+    print(f"🏷️  {target}")
+    for ax, items in tags.items():
+        print(f"  {ax:<11} " + ", ".join(f"{lab} ({p:.2f})" for lab, p in items))
+
+
+def cmd_embed(all_folders=False, limit=None):
+    folders = (sorted(d.name for d in CLF.DIRT.iterdir() if d.is_dir())
+               if all_folders else [s for s in CLF.corpus_sounds() if CLF.folder_files(s)])
+    if limit:
+        folders = folders[:limit]
+    print(f"⛵ embedding samples from {len(folders)} folders for vibe search…")
+    embs, names = [], []
+    for i, name in enumerate(folders, 1):
+        files = CLF.folder_files(name)
+        if not files:
+            continue
+        ae, kept = embed_audios(files[:CLF.MAX_FILES])
+        for row, path in zip(ae, kept):
+            embs.append(row)
+            names.append(f"{name}/{Path(path).stem}")
+        if i % 10 == 0:
+            print(f"  [{i}/{len(folders)}] {len(names)} samples embedded", flush=True)
+    M = np.array(embs, dtype=np.float32)
+    np.savez(EMBEDS, embeds=M, names=np.array(names, dtype=object))
+    print(f"✓ {EMBEDS.name}: {M.shape[0]} samples × {M.shape[1]}-d")
+
+
+def cmd_search(query, limit=15):
+    print(f"🔎 vibe search: “{query}”\n")
+    for name, sim in vibe_search(query, topn=limit):
+        print(f"  {sim:+.3f}  {name}")
+
+
+def main():
+    args = sys.argv[1:]
+    cmd = args[0] if args else None
+    if cmd == "tags" and len(args) > 1:
+        cmd_tags(args[1])
+    elif cmd == "embed":
+        cmd_embed(all_folders="--all" in args,
+                  limit=int(args[args.index("--limit") + 1]) if "--limit" in args else None)
+    elif cmd == "search" and len(args) > 1:
+        cmd_search(args[1],
+                   limit=int(args[args.index("--limit") + 1]) if "--limit" in args else 15)
+    else:
+        sys.exit('usage: sample_semantics.py [tags <folder|file> | '
+                 'embed [--all] [--limit N] | search "<query>" [--limit N]]')
+
+
+# Run the CLI only when executed as a script; serve.py imports this module for its helpers.
+if __name__ == "__main__":
+    main()
--- a/armada/tide-table/semantics_embeds.npz
+++ b/armada/tide-table/semantics_embeds.npz