Commit ac4643d0 by PLN (Algolia)

feat(semantics): validated CLAP vibe-search + live /vibe endpoint (#82,#86)

Katana-first finding: per-one-shot CLAP genre/mood tags are unreliable (every
hit → boom-bap/euphoric — a 0.3s sound has no genre), but the audio EMBEDDINGS
are gold for RELATIVE similarity. 'warm dusty rhodes' → suns_keys gold-keys +
west-coast electric; 'jazzy upright bass' → no_sunshine/come_bass loops; a kick's
nearest neighbours are other kicks (0.96 cross-folder). So we ship similarity,
not fake absolute labels (Principle 1: trust the instrument).

- sample_semantics.py validated on real audio; semantics_embeds.npz = 1490×512-d.
- serve.py: lazy CLAP /vibe?q= (embed any phrase → rank) + /similar?name= (by
  audio-embed cosine). 503 if unbuilt, 400/404 on bad input; static serving
  untouched. Single-user LAN, torch loads once on first hit.
parent 7981dd07
......@@ -7,12 +7,92 @@ the printed LAN URL on a phone on the same wifi.
python3 serve.py --dir tide-table/punkachien --port 8731
"""
import argparse, os, re, socket
import argparse, json, os, re, socket, sys, threading
from functools import partial
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import urlparse, parse_qs
# ── optional CLAP vibe-search API (lazy: torch loads on first /vibe hit) ───────
# Serves /vibe?q=<phrase> and /similar?name=folder/stem over the cached sample
# embeddings (semantics_embeds.npz in --dir). Absent/unbuilt → 503 with a hint,
# never breaks static serving. See sample_semantics.py.
_VIBE = {}  # filled once by _vibe_load(); the "ready" key marks a complete load
_VIBE_LOCK = threading.Lock()  # serialises the first load across handler threads


def _vibe_load():
    """Lazy-load CLAP + the cached embed table once. Returns state or raises.

    On success the module-level ``_VIBE`` dict holds:
        S     – the imported ``sample_semantics`` module
        np    – the imported ``numpy`` module
        M     – the cached embeddings converted to float32
        names – sample names as a list of ``str``
        idx   – ``{name: position in names}`` lookup

    Raises:
        FileNotFoundError: the embed cache file does not exist yet.
        Exception: anything raised while importing numpy / sample_semantics
            or reading the cache propagates. ``_VIBE`` is only filled after
            everything succeeded, so the next call retries from scratch.
    """
    if "ready" in _VIBE:  # fast path: no lock needed once loaded
        return _VIBE
    with _VIBE_LOCK:
        if "ready" in _VIBE:  # another thread finished while we waited
            return _VIBE
        cwd = os.getcwd()  # --dir (tide-table) for imports
        if not sys.path or sys.path[0] != cwd:
            # A failed load is retried on the next request; do not stack a
            # duplicate sys.path entry on every retry.
            sys.path.insert(0, cwd)
        import numpy as np
        import sample_semantics as S
        if not S.EMBEDS.exists():
            raise FileNotFoundError(f"{S.EMBEDS.name} not built — run "
                                    "`python3 sample_semantics.py embed`")
        # NOTE(security): the cache stores its names with dtype=object, so it
        # can only be read with allow_pickle=True. Unpickling can execute code:
        # only point this at caches written by sample_semantics.py itself.
        with np.load(S.EMBEDS, allow_pickle=True) as z:
            # Read each array exactly once, inside the `with`, so the archive's
            # file handle is closed instead of living for the whole process.
            M = z["embeds"].astype("float32")
            names = [str(x) for x in z["names"]]
        _VIBE.update(S=S, np=np, M=M, names=names,
                     idx={name: i for i, name in enumerate(names)})
        _VIBE["ready"] = True
        return _VIBE
def _vibe_rank(qvec, n, drop=-1):
    """Rank the cached samples by dot product with ``qvec``, best first.

    Reads the table stored in the module-level ``_VIBE`` dict (keys ``M``,
    ``names`` and ``np``), so it must only run after ``_vibe_load`` succeeded.

    Args:
        qvec: query vector multiplied against every row of ``_VIBE["M"]``.
        n: maximum number of results; ``n <= 0`` yields an empty list.
        drop: row index to leave out of the results (``/similar`` passes the
            query sample's own row). The default ``-1`` never matches, because
            argsort indices are non-negative.

    Returns:
        Up to ``n`` dicts ``{"name": ..., "sim": ...}`` in descending
        similarity, with ``sim`` rounded to 4 decimals.
    """
    if n <= 0:
        # The loop only checks the limit *after* appending, so without this
        # guard a zero or negative n would still return one hit.
        return []
    V = _VIBE
    sims = V["M"] @ qvec
    order = V["np"].argsort(-sims)  # negate → descending similarity
    out = []
    for i in order:
        if i == drop:
            continue
        out.append({"name": V["names"][i], "sim": round(float(sims[i]), 4)})
        if len(out) >= n:
            break
    return out
class RangeHandler(SimpleHTTPRequestHandler):
def do_GET(self):
    """Send the two JSON API routes to ``_api``; serve everything else statically."""
    route, _, _ = self.path.partition("?")  # path without the query string
    if route not in ("/vibe", "/similar"):
        return super().do_GET()
    return self._api()
def _api(self):
    """Answer ``/vibe?q=<phrase>`` and ``/similar?name=<sample>`` with JSON.

    Query parameters:
        n:    maximum number of results (default 24); must be a positive integer.
        q:    free-text phrase to embed and rank against (``/vibe``).
        name: sample whose stored embedding is used as the query (``/similar``).

    Status codes: 400 for a bad ``n`` or an empty phrase, 404 for an unknown
    sample name, 503 when the search state cannot be loaded, 500 for any other
    failure while embedding/ranking, 200 otherwise.
    """
    u = urlparse(self.path)
    q = parse_qs(u.query)
    # Validate n before the expensive lazy load: a malformed value is a client
    # error and must not escape the handler as an uncaught ValueError.
    try:
        n = int(q.get("n", ["24"])[0])
    except ValueError:
        n = 0
    if n < 1:
        return self._json({"error": "n must be a positive integer"}, 400)
    try:
        V = _vibe_load()
    except Exception as e:  # cache not built, missing deps, … → unavailable
        return self._json({"error": str(e)}, 503)
    try:
        if u.path == "/vibe":
            phrase = (q.get("q", [""])[0]).strip()
            if not phrase:
                return self._json({"error": "empty query"}, 400)
            qe = V["S"]._embed_texts([phrase]).cpu().numpy()[0]
            return self._json({"query": phrase, "results": _vibe_rank(qe, n)})
        # /similar: rank against the sample's own stored row, minus itself
        name = q.get("name", [""])[0]
        if name not in V["idx"]:
            return self._json({"error": f"unknown sample {name!r}"}, 404)
        i = V["idx"][name]
        return self._json({"of": name, "results": _vibe_rank(V["M"][i], n, drop=i)})
    except Exception as e:
        return self._json({"error": repr(e)}, 500)
def _json(self, obj, code=200):
    """Send ``obj`` serialised as JSON with HTTP status ``code``."""
    payload = json.dumps(obj).encode()
    self.send_response(code)
    for header, value in (("Content-Type", "application/json"),
                          ("Content-Length", str(len(payload)))):
        self.send_header(header, value)
    self.end_headers()
    self.wfile.write(payload)
def end_headers(self):
self.send_header("Accept-Ranges", "bytes")
self.send_header("Cache-Control", "no-cache")
......
#!/usr/bin/env python3
"""sample_semantics — the L2 'tie-the-knot' engine: open-vocabulary tags + vibe search.
When PLN reaches for a sound it's never "give me family=keys" — it's "a LUSH keyboard",
"some VOICE CHOIRS", "a SOMBRE ténébreux synth", "BRASS long samples", "let's go NUJAZZ",
"californian G-FUNK". Those words live on several independent axes at once (instrument ×
timbre × genre-scene × mood), and CLAP is the bridge: every phrase is BOTH a label to
score a sample against AND a query to search the library with — same embedding space,
both directions.
This module scores each sample on a rich MULTI-AXIS descriptor ontology (one softmax per
axis → top tags + confidence per axis), and runs free-text vibe search ("find me a
warm dusty rhodes") by embedding the query and ranking samples by cosine similarity. The
ontology is meant to grow — add the words you actually use; FR + EN both welcome. CLAP
proposes, PLN's ear confirms ([[reference_audio_feature_stack]], [[feedback_sample_identity]]).
python3 sample_semantics.py tags <folder|file> # multi-axis tags for a sound
python3 sample_semantics.py search "lush warm rhodes" [--all] [--limit N]
python3 sample_semantics.py embed [--all] [--limit N] # cache audio embeds → semantics_embeds.npz
Reuses the validated CLAP primitives from sample_classify (text tower encoded once,
audio via get_audio_features().pooler_output, normalized — identical to a full forward).
"""
import json
import sys
from pathlib import Path
import numpy as np
import sample_classify as CLF
# Directory containing this module; the two files below are resolved next to it.
HERE = Path(__file__).resolve().parent
EMBEDS = HERE / "semantics_embeds.npz" # cached per-file audio embeddings (vibe search)
# NOTE(review): OUT is not referenced anywhere in the visible part of this file.
OUT = HERE / "sample_semantics.json"
# ── the multi-axis descriptor ontology ────────────────────────────────────────
# Each axis is scored INDEPENDENTLY (own softmax) — a sound is e.g. instrument=rhodes,
# timbre=warm+lush, scene=nu-jazz, mood=dreamy all at once. Phrases drop into the
# "this is the sound of {phrase}" frame CLAP was trained on. PLN's own words are folded
# in verbatim where they're vivid (lush, ténébreux, g-funk).
# Maps axis name → list of candidate phrases; _axis_embeds() embeds every list
# with _embed_texts(), which adds the "this is the sound of " prefix.
AXES = {
"instrument": [
"a kick drum", "a snare drum", "a hi-hat", "hand percussion", "a drum break",
"a sub bass", "an acid bassline", "a grand piano", "a rhodes electric piano",
"a hammond organ", "an electric guitar", "a synth lead", "a synth pad",
"a string ensemble", "a brass section", "a saxophone", "a flute",
"a choir of voices", "a solo singing voice", "a spoken voice", "a bell or mallet",
"a plucked string instrument", "a sitar or eastern string instrument",
],
"timbre": [
"a warm sound", "a dark sound", "a bright sparkling sound", "a lush rich sound",
"a gritty distorted sound", "a smooth clean sound", "a harsh aggressive sound",
"a dusty lo-fi sound", "a glassy crystalline sound", "a woody organic sound",
"a metallic sound", "an airy breathy sound", "a fat thick sound",
"a thin sound", "a shadowy ténébreux sound", "a dreamy washed-out sound",
"a punchy tight sound", "a muddy boomy sound",
],
"scene": [
"nu jazz", "jazz", "californian g-funk", "boom bap hip hop", "trap",
"deep house", "techno", "acid", "jungle drum and bass", "dub reggae",
"ambient", "disco funk", "afrobeat", "indian classical raga", "latin",
"cinematic orchestral", "soul and motown", "chiptune", "garage",
],
"mood": [
"sombre and melancholic", "euphoric and uplifting", "dreamy and floating",
"aggressive and intense", "playful and quirky", "tense and suspenseful",
"hypnotic and driving", "warm and nostalgic", "cold and clinical",
"epic and triumphant", "intimate and tender", "dark and brooding",
],
}
# ── CLAP primitives (own text embeds per axis; shares the model load) ──────────
# Module-level cache: _axis_embeds() stores the per-axis text embeddings under
# the "axes" key the first time it is called.
_S = {}
def _embed_texts(prompts):
    """Return L2-normalized CLAP text embeddings, one row per phrase.

    Every phrase is prefixed with "this is the sound of " before tokenizing.
    Model, processor and torch come from sample_classify's shared CLAP state;
    the embedding is taken from ``get_text_features(...).pooler_output``.
    """
    clap = CLF._clap()  # ensures model is loaded
    torch, processor, model = clap["torch"], clap["proc"], clap["model"]
    frame = "this is the sound of "
    tokens = processor(text=[frame + phrase for phrase in prompts],
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        pooled = model.get_text_features(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
        ).pooler_output
    # Scale every row to unit L2 norm.
    return pooled / pooled.norm(p=2, dim=-1, keepdim=True)
def _axis_embeds():
    """Return {axis: (labels, text_embed_tensor)}, computing it on first call.

    The result is memoized in the module-level ``_S`` dict under "axes", so the
    axis phrases are only embedded once per process.
    """
    cached = _S.get("axes")
    if cached is None:
        cached = {}
        for axis, labels in AXES.items():
            cached[axis] = (labels, _embed_texts(labels))
        _S["axes"] = cached  # stored only after every axis embedded successfully
    return cached
def embed_audios(paths):
    """Embed audio files with CLAP → (array of unit-norm rows, kept paths).

    Files are decoded with ``sample_classify.load_audio`` on a 4-worker thread
    pool. Paths whose decode returned None are dropped, so the second return
    value lists exactly the paths that own a row in the array. When nothing
    decodes, an empty (0, 512) float32 array and an empty list are returned.
    """
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=4) as pool:
        waves = list(pool.map(CLF.load_audio, paths))
    kept_paths, kept_waves = [], []
    for path, wave in zip(paths, waves):
        if wave is not None:
            kept_paths.append(path)
            kept_waves.append(wave)
    if not kept_paths:
        return np.zeros((0, 512), dtype=np.float32), []
    clap = CLF._clap()
    torch = clap["torch"]
    with torch.no_grad():
        inputs = clap["proc"](audio=kept_waves, sampling_rate=CLF.SR,
                              return_tensors="pt", padding=True)
        pooled = clap["model"].get_audio_features(
            input_features=inputs["input_features"]).pooler_output
        unit = pooled / pooled.norm(p=2, dim=-1, keepdim=True)
    return unit.cpu().numpy(), kept_paths
# ── tagging: multi-axis labels for one sound ──────────────────────────────────
def tag_audio_embed(ae_row, topk=3):
    """Score one normalized audio-embed row against every axis.

    Returns {axis: [(label, prob), ...]} with the ``topk`` most probable labels
    per axis. Each axis gets its own softmax over its labels; probabilities
    are rounded to 3 decimals.
    """
    clap = CLF._clap()
    torch = clap["torch"]
    query = torch.tensor(ae_row).unsqueeze(0)  # add a leading batch dimension
    logit_scale = clap["logit_scale"]
    tags = {}
    for axis, (labels, text_embeds) in _axis_embeds().items():
        scores = (logit_scale * query @ text_embeds.t()).squeeze(0)
        probabilities = torch.softmax(scores, dim=-1).tolist()
        ranked = sorted(zip(labels, probabilities), key=lambda pair: -pair[1])
        tags[axis] = [(label, round(prob, 3)) for label, prob in ranked[:topk]]
    return tags
def tag_folder(name, topk=3):
    """Tag a whole folder from the mean of its files' embeddings.

    Embeds at most ``CLF.MAX_FILES`` files, averages the rows, re-normalizes
    the mean and tags it with ``tag_audio_embed``. Returns None when the folder
    has no files or none of them produced an embedding.
    """
    files = CLF.folder_files(name)
    if not files:
        return None
    embeds, _ = embed_audios(files[:CLF.MAX_FILES])
    if len(embeds) == 0:
        return None
    centroid = embeds.mean(0, keepdims=True)
    # The epsilon keeps the division defined if the mean collapses to zero.
    centroid = centroid / (np.linalg.norm(centroid) + 1e-9)
    return tag_audio_embed(centroid[0], topk=topk)
# ── vibe search: free-text query → ranked samples ─────────────────────────────
def vibe_search(query, topn=15):
    """Rank the cached samples against a free-text query.

    Args:
        query: phrase embedded with ``_embed_texts``.
        topn: number of best matches to return.

    Returns:
        ``[(name, similarity), ...]`` sorted best first, with the similarity
        (dot product of cached embedding and query embedding) rounded to
        3 decimals.

    Exits the process via ``sys.exit`` with a hint when the embed cache file
    does not exist.
    """
    if not EMBEDS.exists():
        sys.exit(f"no {EMBEDS.name} — run `python3 sample_semantics.py embed --all` first.")
    # NOTE(security): names are saved with dtype=object, so reading them needs
    # allow_pickle=True. Unpickling can execute code — only load caches that
    # cmd_embed wrote.
    with np.load(EMBEDS, allow_pickle=True) as z:
        # Pull both arrays into memory here so the archive's file handle is
        # closed on exit instead of being left open.
        M, names = z["embeds"], z["names"]
    qe = _embed_texts([query]).cpu().numpy()[0]
    sims = M @ qe
    order = np.argsort(-sims)[:topn]  # negate → descending similarity
    return [(str(names[i]), round(float(sims[i]), 3)) for i in order]
# ── commands ──────────────────────────────────────────────────────────────────
def cmd_tags(target):
    """Print the top tags per axis for one audio file or one sample folder.

    An existing file is embedded and tagged directly; anything else is treated
    as a folder name and handed to ``tag_folder``. Exits with a message when
    decoding fails or no tags could be produced.
    """
    path = Path(target)
    if not path.is_file():
        tags = tag_folder(target)
    else:
        rows, _ = embed_audios([path])
        if len(rows) == 0:
            sys.exit("decode failed")
        tags = tag_audio_embed(rows[0])
    if not tags:
        sys.exit(f"no folder/files for {target!r}")
    print(f"🏷️ {target}")
    for axis, scored in tags.items():
        rendered = ", ".join(f"{label} ({prob:.2f})" for label, prob in scored)
        print(f" {axis:<11} " + rendered)
def cmd_embed(all_folders=False, limit=None):
    """Embed samples folder by folder and cache them in ``EMBEDS`` for vibe search.

    Args:
        all_folders: when true, walk every directory under ``CLF.DIRT``;
            otherwise only the ``CLF.corpus_sounds()`` entries that have files.
        limit: keep only the first ``limit`` folders (falsy → no limit).

    Writes an ``.npz`` holding ``embeds`` (float32, one row per sample) and
    ``names`` (``"<folder>/<file stem>"`` per row, at most ``CLF.MAX_FILES``
    files per folder). Exits with a message, leaving any existing cache
    untouched, when not a single sample could be embedded.
    """
    folders = (sorted(d.name for d in CLF.DIRT.iterdir() if d.is_dir())
               if all_folders else [s for s in CLF.corpus_sounds() if CLF.folder_files(s)])
    if limit:
        folders = folders[:limit]
    print(f"⛵ embedding samples from {len(folders)} folders for vibe search…")
    embs, names = [], []
    for i, name in enumerate(folders, 1):
        files = CLF.folder_files(name)
        if not files:
            continue
        ae, kept = embed_audios(files[:CLF.MAX_FILES])
        for row, path in zip(ae, kept):
            embs.append(row)
            names.append(f"{name}/{Path(path).stem}")
        if i % 10 == 0:
            print(f" [{i}/{len(folders)}] {len(names)} samples embedded", flush=True)
    if not embs:
        # np.array([]) is 1-D: saving it would write a cache the search code
        # cannot use, and M.shape[1] below would raise IndexError.
        sys.exit(f"no samples embedded — {EMBEDS.name} not written")
    M = np.array(embs, dtype=np.float32)
    np.savez(EMBEDS, embeds=M, names=np.array(names, dtype=object))
    print(f"✓ {EMBEDS.name}: {M.shape[0]} samples × {M.shape[1]}-d")
def cmd_search(query, limit=15):
    """Print the ``limit`` best vibe-search matches for ``query``, one per line."""
    print(f"🔎 vibe search: “{query}”\n")
    hits = vibe_search(query, topn=limit)
    for sample, score in hits:
        print(f" {score:+.3f} {sample}")
def main():
    """Command-line entry point: dispatch ``tags``, ``embed`` or ``search``.

    Reads ``sys.argv``. Any unrecognised invocation — including ``--limit``
    without a value or with a non-integer value — exits with the usage string
    instead of an IndexError/ValueError traceback.
    """
    usage = ('usage: sample_semantics.py [tags <folder|file> | '
             'embed [--all] [--limit N] | search "<query>" [--limit N]]')
    args = sys.argv[1:]
    cmd = args[0] if args else None

    def limit_or(default):
        """Return the integer after ``--limit``, or ``default`` when absent."""
        if "--limit" not in args:
            return default
        try:
            return int(args[args.index("--limit") + 1])
        except (IndexError, ValueError):
            sys.exit(usage)

    if cmd == "tags" and len(args) > 1:
        cmd_tags(args[1])
    elif cmd == "embed":
        # Parse the limit before any work starts so bad input fails fast.
        limit = limit_or(None)
        cmd_embed(all_folders="--all" in args, limit=limit)
    elif cmd == "search" and len(args) > 1:
        limit = limit_or(15)
        cmd_search(args[1], limit=limit)
    else:
        sys.exit(usage)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment