From 70c6f07ef7ea05709a110fdc7710e44d413e4f5e Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Mon, 8 Jun 2026 22:35:59 +0200
Subject: [PATCH 1/8] Add configuration to ort for better performance

---
 README.md          |  6 ++++-
 resources/piper.js |  8 +++++++
 serve.py           | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 serve.py

diff --git a/README.md b/README.md
index e2f09126..487f5ecb 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
 # Piper Samples
 
-Samples for [Piper](https://github.com/rhasspy/piper) text to speech system.
+
+Samples for [Piper](https://github.com/OHF-Voice/piper1-gpl) text to speech system.
+
+## Demo
+To run the demo, start `python serve.py`; it sends the COOP and COEP headers that onnxruntime-web requires to run with multiple threads.
\ No newline at end of file
diff --git a/resources/piper.js b/resources/piper.js
index 82ebdb1a..0521ecc6 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -2,6 +2,14 @@
 
 import EspeakModule from "./espeakng.worker.js";
 
+// Run onnxruntime inference in a Web Worker so it doesn't block the UI thread.
+ort.env.wasm.proxy = true;
+
+// Use multiple threads for inference. This only takes effect when the page is
+// cross-origin isolated (COOP + COEP headers -> SharedArrayBuffer available);
+// otherwise onnxruntime-web silently falls back to a single thread. See serve.py.
+ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
+
 const AUDIO_OUTPUT_SYNCHRONOUS = 2;
 const espeakCHARS_AUTO = 0;
 
diff --git a/serve.py b/serve.py
new file mode 100644
index 00000000..b835ca54
--- /dev/null
+++ b/serve.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Static server for the Piper demo with cross-origin isolation enabled.
+
+onnxruntime-web can only run multi-threaded WASM (using SharedArrayBuffer) when
+the page is "cross-origin isolated". That requires two response headers that the
+stock `python -m http.server` does not send:
+
+    Cross-Origin-Opener-Policy: same-origin
+    Cross-Origin-Embedder-Policy: credentialless
+
+We use `credentialless` rather than `require-corp`: both enable cross-origin
+isolation (and thus SharedArrayBuffer / threads), but `credentialless` still lets
+no-cors cross-origin assets load (e.g. the sponsor badge), instead of blocking
+anything that doesn't send a CORP header. Supported in Firefox 119+ / Chrome 96+.
+
+Run this from the demo directory:
+
+    python serve.py            # serves on http://localhost:8080
+    python serve.py 9000       # custom port
+
+After loading the page, confirm in the browser console:
+
+    crossOriginIsolated === true
+
+If that is false, the headers are not reaching the browser and inference will
+stay single-threaded.
+"""
+
+import sys
+from functools import partial
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class CrossOriginIsolatedHandler(SimpleHTTPRequestHandler):
+    def end_headers(self):
+        self.send_header("Cross-Origin-Opener-Policy", "same-origin")
+        self.send_header("Cross-Origin-Embedder-Policy", "credentialless")
+        # Avoid stale assets while developing.
+        self.send_header("Cache-Control", "no-store")
+        super().end_headers()
+
+
+def main():  # Serve the current directory on the port given as argv[1] (default 8080).
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8080
+    handler = partial(CrossOriginIsolatedHandler, directory=".")
+    with ThreadingHTTPServer(("127.0.0.1", port), handler) as httpd:  # loopback only, matching the printed URL
+        print(f"Serving cross-origin-isolated demo on http://localhost:{port}")
+        print("Confirm `crossOriginIsolated === true` in the browser console.")
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:  # Ctrl+C ends the server without a traceback
+            print("\nStopped.")
+
+
+if __name__ == "__main__":
+    main()

From 61761ceaa81926ec168cfa8cba6537c9a366e60b Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Tue, 9 Jun 2026 08:08:38 +0200
Subject: [PATCH 2/8] working per sequence

---
 resources/demo.js  | 119 +++++++++++++++++++++++++++++++++----
 resources/piper.js | 144 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 210 insertions(+), 53 deletions(-)

diff --git a/resources/demo.js b/resources/demo.js
index 30d53106..237fe3e5 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -1,9 +1,56 @@
-import { setVoice, textToWavAudio } from "./piper.js";
+import {
+  setVoice,
+  textToAudioSentences,
+  float32ToWavBlob,
+  getSampleRate,
+} from "./piper.js";
 
 let voiceUrl = "";
 let loadedVoiceUrl = "";
 let voiceConfigUrl = "";
 
+// Silence inserted between sentences, both for live playback scheduling and in the
+// assembled WAV so replay matches the stream. Tune to taste.
+const SENTENCE_GAP_SECONDS = 0.2;
+
+// Web Audio playback state (created lazily on first user gesture, reused after).
+let audioCtx = null;
+// Bumped on each Speak click so an in-flight stream knows to abort.
+let playbackGeneration = 0;
+// Source nodes scheduled for the current playback, so we can stop them on re-click.
+let activeSources = [];
+
+function stopPlayback() {
+  for (const src of activeSources) {
+    try {
+      src.stop();
+    } catch {
+      // Already stopped/ended.
+    }
+  }
+  activeSources = [];
+}
+
+// Concatenate per-sentence Float32 chunks with SENTENCE_GAP_SECONDS of silence
+// between them, matching what was played live.
+function joinWithGaps(chunks, sampleRate) {
+  const gapSamples = Math.round(SENTENCE_GAP_SECONDS * sampleRate);
+  const total =
+    chunks.reduce((n, c) => n + c.length, 0) +
+    gapSamples * Math.max(0, chunks.length - 1);
+
+  const out = new Float32Array(total);
+  let offset = 0;
+  chunks.forEach((chunk, i) => {
+    out.set(chunk, offset);
+    offset += chunk.length;
+    if (i < chunks.length - 1) {
+      offset += gapSamples; // leave zeros (silence)
+    }
+  });
+  return out;
+}
+
 async function main() {
   const fileModel = document.getElementById("fileModel");
   const fileConfig = document.getElementById("fileConfig");
@@ -97,18 +144,68 @@ async function main() {
       noiseWScale = null;
     }
 
+    // Stop any in-progress playback and mark this as the current generation.
+    const generation = ++playbackGeneration;
+    stopPlayback();
+
+    if (!audioCtx) {
+      audioCtx = new AudioContext();
+    }
+    await audioCtx.resume(); // requires a user gesture, which this click is
+
+    const sampleRate = getSampleRate();
+    const chunks = [];
+    let nextStartTime = 0;
+
     status.innerHTML = "Synthesizing audio...";
-    const wavAudio = await textToWavAudio(
-      text,
-      speakerId,
-      lengthScale,
-      noiseScale,
-      noiseWScale,
-    );
-    const audioURL = URL.createObjectURL(wavAudio);
+    try {
+      for await (const audio of textToAudioSentences(
+        text,
+        speakerId,
+        lengthScale,
+        noiseScale,
+        noiseWScale,
+      )) {
+        // A newer click superseded us while we were synthesizing.
+        if (generation !== playbackGeneration) {
+          return;
+        }
+
+        chunks.push(audio);
+
+        // Schedule this sentence to play right after the previous one.
+        const buffer = audioCtx.createBuffer(1, audio.length, sampleRate);
+        buffer.copyToChannel(audio, 0);
+        const source = audioCtx.createBufferSource();
+        source.buffer = buffer;
+        source.connect(audioCtx.destination);
+
+        if (nextStartTime === 0) {
+          nextStartTime = audioCtx.currentTime + 0.1; // small lead-in
+          status.innerHTML = "Playing...";
+        }
+        // Never schedule in the past: a slow synth yields a gap, not an overlap.
+        nextStartTime = Math.max(nextStartTime, audioCtx.currentTime);
+        source.start(nextStartTime);
+        nextStartTime += buffer.duration + SENTENCE_GAP_SECONDS;
 
-    audioTTS.src = audioURL;
-    audioTTS.play();
+        activeSources.push(source);
+      }
+    } catch (e) {
+      status.innerHTML = "Error";
+      throw e;
+    }
+
+    if (generation !== playbackGeneration) {
+      return;
+    }
+
+    // Hybrid: assemble the full WAV so the <audio> element supports replay/seek/download.
+    // Do not auto-play - it is already playing via Web Audio.
+    if (chunks.length > 0) {
+      const full = joinWithGaps(chunks, sampleRate);
+      audioTTS.src = URL.createObjectURL(float32ToWavBlob(full, sampleRate));
+    }
 
     status.innerHTML = "Ready";
   });
diff --git a/resources/piper.js b/resources/piper.js
index 0521ecc6..4c0ce787 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -57,27 +57,84 @@ async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
   voiceModel = await ort.InferenceSession.create(voiceModelUrl);
 }
 
+function getSampleRate() {
+  if (!voiceConfig) {
+    throw new Error("Voice is not set");
+  }
+  return voiceConfig.audio.sample_rate;
+}
+
+// Resolve scale arguments, falling back to the voice config defaults.
+function resolveScales(lengthScale, noiseScale, noiseWScale) {
+  return {
+    lengthScale: lengthScale ?? voiceConfig.inference.length_scale ?? 1.0,
+    noiseScale: noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667,
+    noiseWScale: noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8,
+  };
+}
+
+// Run the ONNX model on a single utterance's phoneme ids, returning Float32 PCM.
+async function synthesizeIds(
+  phonemeIds,
+  speakerId,
+  lengthScale,
+  noiseScale,
+  noiseWScale,
+) {
+  const phonemeIdsTensor = new ort.Tensor(
+    "int64",
+    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
+    [1, phonemeIds.length],
+  );
+  const phonemeLengthsTensor = new ort.Tensor(
+    "int64",
+    BigInt64Array.from([BigInt(phonemeIds.length)]),
+    [1],
+  );
+  const scalesTensor = new ort.Tensor(
+    "float32",
+    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
+    [3],
+  );
+
+  let feeds = {
+    input: phonemeIdsTensor,
+    input_lengths: phonemeLengthsTensor,
+    scales: scalesTensor,
+  };
+
+  if (voiceConfig.num_speakers > 1) {
+    // Multi-speaker
+    feeds["sid"] = new ort.Tensor(
+      "int64",
+      BigInt64Array.from([BigInt(speakerId ?? 0)]),
+    );
+  }
+
+  const results = await voiceModel.run(feeds);
+  return results.output.cpuData;
+}
+
 async function textToWavAudio(
   text,
   speakerId = undefined,
-  noiseScale = undefined,
   lengthScale = undefined,
+  noiseScale = undefined,
   noiseWScale = undefined,
 ) {
   if (!voiceConfig) {
     throw new Error("Voice is not set");
   }
 
-  const sampleRate = voiceConfig.audio.sample_rate;
   const float32Audio = await textToFloat32Audio(
     text,
     speakerId,
-    noiseScale,
     lengthScale,
+    noiseScale,
     noiseWScale,
   );
 
-  return float32ToWavBlob(float32Audio, sampleRate);
+  return float32ToWavBlob(float32Audio, getSampleRate());
 }
 
 async function textToFloat32Audio(
@@ -91,52 +148,48 @@ async function textToFloat32Audio(
     throw new Error("Voice is not set");
   }
 
-  lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
-  noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
-  noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
-
-  if (voiceConfig.num_speakers > 1) {
-    speakerId = speakerId ?? 0; // first speaker
-  }
+  const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
 
   const textPhonemes = textToPhonemes(text);
   const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
 
-  // Run onnx model
-  const phonemeIdsTensor = new ort.Tensor(
-    "int64",
-    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
-    [1, phonemeIds.length],
-  );
-  const phonemeLengthsTensor = new ort.Tensor(
-    "int64",
-    BigInt64Array.from([BigInt(phonemeIds.length)]),
-    [1],
-  );
-  const scalesTensor = new ort.Tensor(
-    "float32",
-    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
-    [3],
+  return synthesizeIds(
+    phonemeIds,
+    speakerId,
+    scales.lengthScale,
+    scales.noiseScale,
+    scales.noiseWScale,
   );
+}
 
-  let feeds = {
-    input: phonemeIdsTensor,
-    input_lengths: phonemeLengthsTensor,
-    scales: scalesTensor,
-  };
-
-  if (voiceConfig.num_speakers > 1) {
-    // Multi-speaker
-    feeds["sid"] = new ort.Tensor(
-      "int64",
-      BigInt64Array.from([BigInt(speakerId)]),
-    );
+// Synthesize a sentence at a time, yielding Float32 PCM for each as soon as it is
+// ready. Lets the caller start playing early instead of waiting for the whole text.
+async function* textToAudioSentences(
+  text,
+  speakerId = undefined,
+  lengthScale = undefined,
+  noiseScale = undefined,
+  noiseWScale = undefined,
+) {
+  if (!voiceConfig) {
+    throw new Error("Voice is not set");
   }
 
-  const results = await voiceModel.run(feeds);
-  const float32Audio = results.output.cpuData;
+  const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
+
+  // textToPhonemes already segments into per-sentence phoneme arrays.
+  const sentences = textToPhonemes(text);
 
-  return float32Audio;
+  for (const sentence of sentences) {
+    const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence]);
+    yield await synthesizeIds(
+      phonemeIds,
+      speakerId,
+      scales.lengthScale,
+      scales.noiseScale,
+      scales.noiseWScale,
+    );
+  }
 }
 
 function textToPhonemes(text) {
@@ -308,4 +361,11 @@ function float32ToWavBlob(floatArray, sampleRate) {
   return new Blob([view], { type: "audio/wav" });
 }
 
-export { setVoice, textToWavAudio, textToFloat32Audio };
+export {
+  setVoice,
+  textToWavAudio,
+  textToFloat32Audio,
+  textToAudioSentences,
+  float32ToWavBlob,
+  getSampleRate,
+};

From 5c56e41d43b93a56b560c1332547e43c17edc372 Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Wed, 10 Jun 2026 19:45:34 +0200
Subject: [PATCH 3/8] Show progress in text

---
 .claude/settings.json |  10 ++
 .gitignore            |   2 +
 demo.html             |  26 ++++-
 resources/demo.js     | 220 ++++++++++++++++++++++++++++++++++--------
 resources/piper.js    | 108 ++++++++++++++++++---
 5 files changed, 307 insertions(+), 59 deletions(-)
 create mode 100644 .claude/settings.json

diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 00000000..d8c9779f
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,10 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(rm -f /tmp/bytemap_test.mjs; find . -name \"*.onnx\" 2>/dev/null | head; echo \"---configs---\"; ls configs/ 2>/dev/null | head)"
+    ],
+    "additionalDirectories": [
+      "/tmp"
+    ]
+  }
+}
diff --git a/.gitignore b/.gitignore
index c995e604..2d6ba342 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,5 @@ output/
 /install/
 
 .python-version
+
+.vscode/
diff --git a/demo.html b/demo.html
index e63b9396..bd79f20a 100644
--- a/demo.html
+++ b/demo.html
@@ -9,8 +9,30 @@
           max-width: 90%;
       }
 
-      #textInput {
+      /* Shared box so the highlight view and the textarea look identical when swapped. */
+      #textInput, #highlightView {
           width: 100%;
+          box-sizing: border-box;
+          font-family: inherit;
+          font-size: 1em;
+          line-height: 1.4;
+          padding: 4px;
+          border: 1px solid #767676;
+          border-radius: 2px;
+      }
+
+      #highlightView {
+          min-height: calc(5 * 1.4em + 8px); /* ~5 rows + padding, matching the textarea */
+          max-height: 40vh;
+          overflow: auto;
+          white-space: pre-wrap;
+          word-wrap: break-word;
+          background: #fff;
+      }
+
+      #highlightView .sentence.active {
+          background: #ffe08a;
+          border-radius: 2px;
       }
 
       #logo {
@@ -75,10 +97,10 @@
     </div>
 
     <textarea id="textInput" rows=5 aria-label="Text to speak" disabled></textarea>
+    <div id="highlightView" aria-live="polite" hidden></div>
 
     <div id="divSpeak">
       <button id="buttonSpeak" aria-label="Speak text" disabled>Speak</button>
-      <audio id="audioTTS" aria-label="Spoken audio" controls></audio>
       <span id="status" role="status" aria-live="polite">Ready</span>
     </div>
 
diff --git a/resources/demo.js b/resources/demo.js
index 237fe3e5..e95d6bdb 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -20,6 +20,31 @@ let playbackGeneration = 0;
 // Source nodes scheduled for the current playback, so we can stop them on re-click.
 let activeSources = [];
 
+// Live-highlight state. `spans` are the per-sentence elements in the read-only view;
+// `segments[i]` is { startTime, endTime } on the audio clock for spans[i]. The rAF loop
+// matches audioCtx.currentTime against the segments to highlight the playing sentence.
+let highlightSpans = [];
+let highlightSegments = [];
+let highlightRAF = null;
+let activeHighlight = -1;
+// True once the synth loop has scheduled every sentence, so the rAF loop knows it can end
+// when the audio clock passes the last segment (rather than stopping mid-stream).
+let highlightSynthDone = false;
+
+function clearHighlight() {
+  if (activeHighlight >= 0 && highlightSpans[activeHighlight]) {
+    highlightSpans[activeHighlight].classList.remove("active");
+  }
+  activeHighlight = -1;
+}
+
+function stopHighlightLoop() {
+  if (highlightRAF !== null) {
+    cancelAnimationFrame(highlightRAF);
+    highlightRAF = null;
+  }
+}
+
 function stopPlayback() {
   for (const src of activeSources) {
     try {
@@ -29,26 +54,9 @@ function stopPlayback() {
     }
   }
   activeSources = [];
-}
-
-// Concatenate per-sentence Float32 chunks with SENTENCE_GAP_SECONDS of silence
-// between them, matching what was played live.
-function joinWithGaps(chunks, sampleRate) {
-  const gapSamples = Math.round(SENTENCE_GAP_SECONDS * sampleRate);
-  const total =
-    chunks.reduce((n, c) => n + c.length, 0) +
-    gapSamples * Math.max(0, chunks.length - 1);
-
-  const out = new Float32Array(total);
-  let offset = 0;
-  chunks.forEach((chunk, i) => {
-    out.set(chunk, offset);
-    offset += chunk.length;
-    if (i < chunks.length - 1) {
-      offset += gapSamples; // leave zeros (silence)
-    }
-  });
-  return out;
+  stopHighlightLoop();
+  clearHighlight();
+  highlightSegments = [];
 }
 
 async function main() {
@@ -58,12 +66,15 @@ async function main() {
   const buttonSpeak = document.getElementById("buttonSpeak");
   const audioTTS = document.getElementById("audioTTS");
   const textInput = document.getElementById("textInput");
+  const highlightView = document.getElementById("highlightView");
   const status = document.getElementById("status");
   const speakerSelect = document.getElementById("speaker");
   const inputLengthScale = document.getElementById("lengthScale");
   const inputNoiseScale = document.getElementById("noiseScale");
   const inputNoiseWScale = document.getElementById("noiseWScale");
 
+  var speaking = false;
+
   fileModel.addEventListener("change", async () => {
     const file = event.target.files[0];
     if (!file) {
@@ -84,6 +95,21 @@ async function main() {
       const voiceConfig = await response.json();
       updateUIForConfig(voiceConfig);
       divConfig.hidden = true;
+
+      if (voiceUrl != loadedVoiceUrl) {
+        status.innerHTML = "Loading voice...";
+        try {
+          await setVoice(voiceUrl, voiceConfigUrl);
+        }
+        catch (e) {
+          status.innerHTML = "Error loading voice";
+          throw e;
+        }
+        loadedVoiceUrl = voiceUrl;
+      }
+
+      status.innerHTML = "Ready";
+      buttonSpeak.disabled = false;
     } else {
       divConfig.hidden = false;
       speakerSelect.hidden = true;
@@ -99,15 +125,77 @@ async function main() {
     const voiceConfig = JSON.parse(await file.text());
     updateUIForConfig(voiceConfig);
     voiceConfigUrl = URL.createObjectURL(file);
+    status.innerHTML = "Ready";
+    buttonSpeak.disabled = false;
   });
 
-  buttonSpeak.addEventListener("click", async () => {
+  function showHighlightView() {
+    textInput.hidden = true;
+    highlightView.hidden = false;
+  }
+
+  function showEditor() {
+    highlightView.hidden = true;
+    textInput.hidden = false;
+  }
+
+  // Reset the read-only view to empty, ready to receive per-sentence spans.
+  function resetHighlightView() {
+    highlightView.textContent = "";
+    highlightSpans = [];
+  }
+
+  // Final cleanup when playback ends naturally: drop the highlight, return to the editor,
+  // and reset the UI to idle.
+  function finishPlayback() {
+    clearHighlight();
+    showEditor();
+    status.innerHTML = "Ready";
+    buttonSpeak.innerHTML = "Speak";
+    speaking = false;
+  }
+
+  // Poll the audio clock each frame and light up whichever sentence span is currently
+  // playing. BufferSource has no "start" event, so matching audioCtx.currentTime against
+  // the segment table is the reliable trigger, and it self-corrects against scheduling
+  // gaps. Runs until superseded or the audio passes the last scheduled segment.
+  function startHighlightLoop(generation) {
+    const tick = () => {
+      if (generation !== playbackGeneration) {
+        return; // Superseded; stopPlayback already cleaned up.
+      }
+      const t = audioCtx.currentTime;
+      const i = highlightSegments.findIndex(
+        (seg) => t >= seg.startTime && t < seg.endTime,
+      );
+      // Keep the current sentence lit through inter-sentence gaps (i === -1); only switch
+      // when a new sentence actually starts.
+      if (i >= 0 && i !== activeHighlight) {
+        clearHighlight();
+        highlightSpans[i].classList.add("active");
+        highlightSpans[i].scrollIntoView({ block: "nearest" });
+        activeHighlight = i;
+      }
+
+      const last = highlightSegments[highlightSegments.length - 1];
+      if (highlightSynthDone && (!last || t >= last.endTime)) {
+        highlightRAF = null;
+        finishPlayback();
+      } else {
+        highlightRAF = requestAnimationFrame(tick);
+      }
+    };
+    stopHighlightLoop();
+    highlightRAF = requestAnimationFrame(tick);
+  }
+
+  async function speak() {
     if (!voiceUrl) {
       alert("Voice model is not set");
       return;
     }
 
-    if (!voiceConfigUrl) {
+    if (!loadedVoiceUrl) {
       alert("Voice config is not set");
       return;
     }
@@ -118,11 +206,7 @@ async function main() {
       return;
     }
 
-    if (voiceUrl != loadedVoiceUrl) {
-      status.innerHTML = "Loading voice...";
-      await setVoice(voiceUrl, voiceConfigUrl);
-      loadedVoiceUrl = voiceUrl;
-    }
+
 
     let speakerId = null;
     if (speakerSelect.selectedIndex > 0) {
@@ -157,9 +241,17 @@ async function main() {
     const chunks = [];
     let nextStartTime = 0;
 
+    // Swap the editable textarea for the read-only highlight view, which fills in sentence
+    // by sentence as synthesis progresses.
+    highlightSynthDone = false;
+    resetHighlightView();
+    showHighlightView();
+    let viewCursor = 0; // Char offset already emitted into the view.
+    let loopStarted = false;
+
     status.innerHTML = "Synthesizing audio...";
     try {
-      for await (const audio of textToAudioSentences(
+      for await (const { audio, start, end } of textToAudioSentences(
         text,
         speakerId,
         lengthScale,
@@ -173,6 +265,20 @@ async function main() {
 
         chunks.push(audio);
 
+        // Append any text between the previous sentence and this one as plain text, then
+        // the sentence itself as a highlightable span. spans and segments stay in lock-step.
+        if (start > viewCursor) {
+          highlightView.appendChild(
+            document.createTextNode(text.slice(viewCursor, start)),
+          );
+        }
+        const span = document.createElement("span");
+        span.className = "sentence";
+        span.textContent = text.slice(start, end);
+        highlightView.appendChild(span);
+        highlightSpans.push(span);
+        viewCursor = end;
+
         // Schedule this sentence to play right after the previous one.
         const buffer = audioCtx.createBuffer(1, audio.length, sampleRate);
         buffer.copyToChannel(audio, 0);
@@ -182,36 +288,68 @@ async function main() {
 
         if (nextStartTime === 0) {
           nextStartTime = audioCtx.currentTime + 0.1; // small lead-in
-          status.innerHTML = "Playing...";
+          status.innerHTML = "Speaking...";
         }
         // Never schedule in the past: a slow synth yields a gap, not an overlap.
         nextStartTime = Math.max(nextStartTime, audioCtx.currentTime);
-        source.start(nextStartTime);
+        const startedAt = nextStartTime;
+        source.start(startedAt);
         nextStartTime += buffer.duration + SENTENCE_GAP_SECONDS;
 
         activeSources.push(source);
+        highlightSegments.push({
+          startTime: startedAt,
+          endTime: startedAt + buffer.duration,
+        });
+
+        if (!loopStarted) {
+          loopStarted = true;
+          startHighlightLoop(generation);
+        }
       }
     } catch (e) {
-      status.innerHTML = "Error";
+      status.innerHTML = "Error while synthesizing";
+      stopPlayback();
+      showEditor();
       throw e;
     }
 
-    if (generation !== playbackGeneration) {
-      return;
+    // All sentences scheduled. Let the rAF loop end naturally once the audio plays out;
+    // if nothing was produced, there is no loop to restore the editor, so do it here.
+    highlightSynthDone = true;
+    if (!loopStarted) {
+      finishPlayback();
     }
+  }
 
-    // Hybrid: assemble the full WAV so the <audio> element supports replay/seek/download.
-    // Do not auto-play - it is already playing via Web Audio.
-    if (chunks.length > 0) {
-      const full = joinWithGaps(chunks, sampleRate);
-      audioTTS.src = URL.createObjectURL(float32ToWavBlob(full, sampleRate));
-    }
 
-    status.innerHTML = "Ready";
+  buttonSpeak.addEventListener("click", async () => {
+    if (!speaking) {
+      speaking = true;
+      buttonSpeak.innerHTML = "Stop";
+      try {
+        // Stays "speaking" through playback; finishPlayback() resets the UI when the
+        // audio plays out. speak() resolves once synthesis is scheduled, not when audio ends.
+        await speak();
+      } catch {
+        // speak() already restored the editor and set an error status.
+        speaking = false;
+        buttonSpeak.innerHTML = "Speak";
+      }
+    } else {
+      // If the user clicks Stop while we're still speaking, stop immediately.
+      playbackGeneration++;
+      stopPlayback();
+      showEditor();
+      speaking = false;
+      status.innerHTML = "Ready";
+      buttonSpeak.innerHTML = "Speak";
+    }
   });
 
   textInput.disabled = false;
-  buttonSpeak.disabled = false;
+  buttonSpeak.disabled = true;
+  status.innerHTML = "Load voice to begin";
   fileModel.value = "";
   fileConfig.value = "";
 }
diff --git a/resources/piper.js b/resources/piper.js
index 4c0ce787..77a62875 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -115,6 +115,8 @@ async function synthesizeIds(
   return results.output.cpuData;
 }
 
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
 async function textToWavAudio(
   text,
   speakerId = undefined,
@@ -137,6 +139,8 @@ async function textToWavAudio(
   return float32ToWavBlob(float32Audio, getSampleRate());
 }
 
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
 async function textToFloat32Audio(
   text,
   speakerId = undefined,
@@ -150,7 +154,7 @@ async function textToFloat32Audio(
 
   const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
 
-  const textPhonemes = textToPhonemes(text);
+  const textPhonemes = textToPhonemes(text).map((segment) => segment.phonemes);
   const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
 
   return synthesizeIds(
@@ -177,29 +181,53 @@ async function* textToAudioSentences(
 
   const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
 
-  // textToPhonemes already segments into per-sentence phoneme arrays.
+  // textToPhonemes already segments into per-sentence { phonemes, start, end }.
   const sentences = textToPhonemes(text);
 
   for (const sentence of sentences) {
-    const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence]);
-    yield await synthesizeIds(
+    const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence.phonemes]);
+    const audio = await synthesizeIds(
       phonemeIds,
       speakerId,
       scales.lengthScale,
       scales.noiseScale,
       scales.noiseWScale,
     );
+    // start/end are character indices into `text`, so the caller can highlight the slice
+    // this audio was synthesized from.
+    yield { audio, start: sentence.start, end: sentence.end };
   }
 }
 
+// Map a UTF-8 byte offset to a JavaScript string (UTF-16 code unit) index. espeak works
+// on the UTF-8 buffer, but the displayed text is indexed in JS string units, so byte
+// offsets must be translated before they can be used to slice/highlight the original text.
+function buildByteToCharMap(text) {
+  const map = new Map();
+  const encoder = new TextEncoder();
+  let byte = 0;
+  let char = 0;
+  map.set(0, 0);
+  for (const ch of text) {
+    // Iterating a string yields whole code points, so astral chars stay intact.
+    byte += encoder.encode(ch).length;
+    char += ch.length; // 2 for surrogate pairs, matching String indexing.
+    map.set(byte, char);
+  }
+  return map;
+}
+
+// Segment text into per-sentence units. Returns an array of
+// { phonemes, start, end } where start/end are character indices into the original
+// `text`, identifying the slice each sentence was synthesized from.
 function textToPhonemes(text) {
   if (!voiceConfig) {
     throw new Error("Voice is not set");
   }
 
   if (voiceConfig.phoneme_type == "text") {
-    // Text phonemes
-    return [Array.from(text.normalize("NFD"))];
+    // Text phonemes: the whole text is a single sentence.
+    return [{ phonemes: Array.from(text.normalize("NFD")), start: 0, end: text.length }];
   }
 
   if (!espeakInstance) {
@@ -236,13 +264,42 @@ function textToPhonemes(text) {
   // End of clause and sentences
   const terminatorPtr = espeakInstance._malloc(4);
 
-  // Phoneme lists for each sentence
+  // Total UTF-8 byte length, used as the end offset for the final clause (where espeak
+  // sets the next-text pointer to 0 instead of a byte offset).
+  const totalBytes = espeakInstance.lengthBytesUTF8(text);
+
+  // espeak reports offsets into the UTF-8 buffer; convert them to character indices into
+  // the original `text` so they can slice/highlight it directly.
+  const byteToChar = buildByteToCharMap(text);
+  const toChar = (byte) => {
+    const char = byteToChar.get(byte);
+    if (char === undefined) {
+      // espeak landed on a byte offset that is not a character boundary in our map. This
+      // shouldn't happen (espeak advances by whole code points); warn loudly because the
+      // fallback below would silently mis-size the highlight.
+      console.warn(`piper: byte offset ${byte} has no character mapping`);
+      return text.length;
+    }
+    return Math.max(0, Math.min(text.length, char));
+  };
+
+  // Sentence segments, each { phonemes, start, end } in character indices.
   const textPhonemes = [];
 
   // Phoneme list for current sentence
   let sentencePhonemes = [];
 
+  // Character offsets: where the next clause begins, and where the current sentence
+  // (accumulation of clauses) began.
+  let cursorChar = 0;
+  let sentenceStartChar = 0;
+
   while (true) {
+    // A new sentence is starting if we haven't accumulated any clauses for it yet.
+    if (sentencePhonemes.length === 0) {
+      sentenceStartChar = cursorChar;
+    }
+
     const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
       textPtrPtr,
       espeakCHARS_AUTO,
@@ -270,13 +327,28 @@ function textToPhonemes(text) {
       sentencePhonemes.push("; ");
     }
 
+    // Where espeak will resume. 0 means the input is exhausted (this clause runs to the
+    // end of the text). Otherwise espeak reads one lookahead character past the clause
+    // boundary, so its resume offset overshoots the true boundary by exactly one
+    // character — subtract it back off (in character space) to land on the start of the
+    // next clause. NOTE(review): -1 is one UTF-16 unit; an astral lookahead spans two.
+    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
+    const endChar =
+      nextTextPtr === 0
+        ? text.length
+        : Math.max(cursorChar, toChar(nextTextPtr - textPtr) - 1);
+    cursorChar = endChar;
+
     if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
       // End of sentence
-      textPhonemes.push(sentencePhonemes);
+      textPhonemes.push({
+        phonemes: sentencePhonemes,
+        start: sentenceStartChar,
+        end: endChar,
+      });
       sentencePhonemes = [];
     }
 
-    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
     if (nextTextPtr === 0) {
       break; // All text processed
     }
@@ -292,16 +364,20 @@ function textToPhonemes(text) {
 
   // Add lingering phonemes
   if (sentencePhonemes.length > 0) {
-    textPhonemes.push(sentencePhonemes);
+    textPhonemes.push({
+      phonemes: sentencePhonemes,
+      start: sentenceStartChar,
+      end: text.length,
+    });
     sentencePhonemes = [];
   }
 
-  // Prepare phonemes for Piper
-  for (let i = 0; i < textPhonemes.length; i++) {
-    textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
-  }
-
-  return textPhonemes;
+  // Prepare phonemes for Piper; start/end are already character indices into `text`.
+  return textPhonemes.map((segment) => ({
+    phonemes: Array.from(segment.phonemes.join("").normalize("NFD")),
+    start: segment.start,
+    end: segment.end,
+  }));
 }
 
 function phonemesToIds(idMap, textPhonemes) {

From e6952c0b3ae5bdaeb46cc42ba0fdf6ece5e74af2 Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Wed, 10 Jun 2026 20:12:56 +0200
Subject: [PATCH 4/8] Refactor - better way to map bytes to chars

---
 resources/piper.js | 59 ++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/resources/piper.js b/resources/piper.js
index 77a62875..2bf034ef 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -199,22 +199,30 @@ async function* textToAudioSentences(
   }
 }
 
-// Map a UTF-8 byte offset to a JavaScript string (UTF-16 code unit) index. espeak works
-// on the UTF-8 buffer, but the displayed text is indexed in JS string units, so byte
-// offsets must be translated before they can be used to slice/highlight the original text.
-function buildByteToCharMap(text) {
-  const map = new Map();
-  const encoder = new TextEncoder();
+function utf8ByteLength(codePoint) {
+  if (codePoint <= 0x7f) return 1;
+  if (codePoint <= 0x7ff) return 2;
+  if (codePoint <= 0xffff) return 3;
+  return 4;
+}
+
+// espeak reports clause boundaries as UTF-8 byte offsets, but the displayed text is
+// indexed in JS string units. Those offsets only ever move forward, so we translate them
+// with a single forward-walking cursor (no lookup table): each call advances through the
+// string until it reaches the requested byte offset and returns the character index there.
+function makeByteToCharCursor(text) {
   let byte = 0;
-  let char = 0;
-  map.set(0, 0);
-  for (const ch of text) {
-    // Iterating a string yields whole code points, so astral chars stay intact.
-    byte += encoder.encode(ch).length;
-    char += ch.length; // 2 for surrogate pairs, matching String indexing.
-    map.set(byte, char);
-  }
-  return map;
+  let char = 0; // JS string index == character index (surrogate pairs count as 2).
+  return (targetByte) => {
+    while (byte < targetByte && char < text.length) {
+      const codePoint = text.codePointAt(char);
+      byte += utf8ByteLength(codePoint);
+      // Advance one whole character: astral code points are a surrogate pair, so they
+      // occupy two UTF-16 string indices; everything in the BMP occupies one.
+      char += codePoint > 0xffff ? 2 : 1;
+    }
+    return char;
+  };
 }
 
 // Segment text into per-sentence units. Returns an array of
@@ -264,24 +272,9 @@ function textToPhonemes(text) {
   // End of clause and sentences
   const terminatorPtr = espeakInstance._malloc(4);
 
-  // Total UTF-8 byte length, used as the end offset for the final clause (where espeak
-  // sets the next-text pointer to 0 instead of a byte offset).
-  const totalBytes = espeakInstance.lengthBytesUTF8(text);
-
-  // espeak reports offsets into the UTF-8 buffer; convert them to character indices into
-  // the original `text` so they can slice/highlight it directly.
-  const byteToChar = buildByteToCharMap(text);
-  const toChar = (byte) => {
-    const char = byteToChar.get(byte);
-    if (char === undefined) {
-      // espeak landed on a byte offset that is not a character boundary in our map. This
-      // shouldn't happen (espeak advances by whole code points); warn loudly because the
-      // fallback below would silently mis-size the highlight.
-      console.warn(`piper: byte offset ${byte} has no character mapping`);
-      return text.length;
-    }
-    return Math.max(0, Math.min(text.length, char));
-  };
+  // Translates espeak's UTF-8 byte offsets to character indices into the original `text`
+  // so they can slice/highlight it directly.
+  const toChar = makeByteToCharCursor(text);
 
   // Sentence segments, each { phonemes, start, end } in character indices.
   const textPhonemes = [];

From b5ed790f34acf9a897d4c56150ddde6d97fb3368 Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Wed, 10 Jun 2026 20:42:31 +0200
Subject: [PATCH 5/8] cleanup

---
 demo.html          |  5 ----
 resources/demo.js  | 57 +++++++++++++++++++---------------------------
 resources/piper.js | 23 +++++--------------
 3 files changed, 29 insertions(+), 56 deletions(-)

diff --git a/demo.html b/demo.html
index bd79f20a..1287ecb5 100644
--- a/demo.html
+++ b/demo.html
@@ -56,11 +56,6 @@
           margin-bottom: 20px;
       }
 
-      #divSpeak > audio {
-          vertical-align: bottom;
-          margin-left: 10px;
-      }
-
       #status {
           margin-left: 10px;
       }
diff --git a/resources/demo.js b/resources/demo.js
index e95d6bdb..7a2a0ee2 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -1,7 +1,6 @@
 import {
   setVoice,
   textToAudioSentences,
-  float32ToWavBlob,
   getSampleRate,
 } from "./piper.js";
 
@@ -9,8 +8,7 @@ let voiceUrl = "";
 let loadedVoiceUrl = "";
 let voiceConfigUrl = "";
 
-// Silence inserted between sentences, both for live playback scheduling and in the
-// assembled WAV so replay matches the stream. Tune to taste.
+// Silence inserted between sentences when scheduling live playback. Tune to taste.
 const SENTENCE_GAP_SECONDS = 0.2;
 
 // Web Audio playback state (created lazily on first user gesture, reused after).
@@ -45,6 +43,13 @@ function stopHighlightLoop() {
   }
 }
 
+// Read a numeric scale input, returning null when blank/invalid so piper falls back to the
+// voice config default.
+function parseScaleOrNull(input) {
+  const value = parseFloat(input.value);
+  return isNaN(value) ? null : value;
+}
+
 function stopPlayback() {
   for (const src of activeSources) {
     try {
@@ -64,7 +69,6 @@ async function main() {
   const fileConfig = document.getElementById("fileConfig");
   const divConfig = document.getElementById("divConfig");
   const buttonSpeak = document.getElementById("buttonSpeak");
-  const audioTTS = document.getElementById("audioTTS");
   const textInput = document.getElementById("textInput");
   const highlightView = document.getElementById("highlightView");
   const status = document.getElementById("status");
@@ -73,10 +77,10 @@ async function main() {
   const inputNoiseScale = document.getElementById("noiseScale");
   const inputNoiseWScale = document.getElementById("noiseWScale");
 
-  var speaking = false;
+  let speaking = false;
 
-  fileModel.addEventListener("change", async () => {
-    const file = event.target.files[0];
+  fileModel.addEventListener("change", async (e) => {
+    const file = e.target.files[0];
     if (!file) {
       return;
     }
@@ -116,8 +120,8 @@ async function main() {
     }
   });
 
-  fileConfig.addEventListener("change", async () => {
-    const file = event.target.files[0];
+  fileConfig.addEventListener("change", async (e) => {
+    const file = e.target.files[0];
     if (!file) {
       return;
     }
@@ -213,20 +217,9 @@ async function main() {
       speakerId = parseInt(speakerSelect.value);
     }
 
-    let lengthScale = parseFloat(inputLengthScale.value);
-    if (isNaN(lengthScale)) {
-      lengthScale = null;
-    }
-
-    let noiseScale = parseFloat(inputNoiseScale.value);
-    if (isNaN(noiseScale)) {
-      noiseScale = null;
-    }
-
-    let noiseWScale = parseFloat(inputNoiseWScale.value);
-    if (isNaN(noiseWScale)) {
-      noiseWScale = null;
-    }
+    const lengthScale = parseScaleOrNull(inputLengthScale);
+    const noiseScale = parseScaleOrNull(inputNoiseScale);
+    const noiseWScale = parseScaleOrNull(inputNoiseWScale);
 
     // Stop any in-progress playback and mark this as the current generation.
     const generation = ++playbackGeneration;
@@ -238,7 +231,6 @@ async function main() {
     await audioCtx.resume(); // requires a user gesture, which this click is
 
     const sampleRate = getSampleRate();
-    const chunks = [];
     let nextStartTime = 0;
 
     // Swap the editable textarea for the read-only highlight view, which fills in sentence
@@ -263,8 +255,6 @@ async function main() {
           return;
         }
 
-        chunks.push(audio);
-
         // Append any text between the previous sentence and this one as plain text, then
         // the sentence itself as a highlightable span. spans and segments stay in lock-step.
         if (start > viewCursor) {
@@ -373,19 +363,18 @@ function updateUIForConfig(voiceConfig) {
   } else {
     // Multi-speaker model
     const speakerIdMap = voiceConfig.speaker_id_map;
-    let sortedSpeakers = Object.keys(speakerIdMap).sort(
+    const sortedSpeakers = Object.keys(speakerIdMap).sort(
       (a, b) => speakerIdMap[a] - speakerIdMap[b],
     );
-    for (let i in sortedSpeakers) {
-      let speaker = sortedSpeakers[i];
-      let option = document.createElement("option");
-      option.text = speaker + " (" + i.toString() + ")";
-      option.value = i.toString();
+    for (const speaker of sortedSpeakers) {
+      const id = speakerIdMap[speaker];
+      const option = document.createElement("option");
+      option.text = `${speaker} (${id})`;
+      option.value = String(id);
       speakerSelect.add(option);
     }
 
-    const selectSpeaker = document.getElementById("divSpeaker");
-    divSpeaker.hidden = false;
+    document.getElementById("divSpeaker").hidden = false;
   }
 
   if (speakerSelect.options.length > 1) {
diff --git a/resources/piper.js b/resources/piper.js
index 2bf034ef..82257380 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -34,7 +34,6 @@ const EOS = "$";
 const PAD = "_";
 
 let espeakInstance = null;
-let espeakInitialized = false;
 let voiceModel = null;
 let voiceConfig = null;
 
@@ -245,26 +244,16 @@ function textToPhonemes(text) {
   const voice = voiceConfig.espeak.voice;
 
   // Set voice
-  const voicePtr = espeakInstance._malloc(
-    espeakInstance.lengthBytesUTF8(voice) + 1,
-  );
-  espeakInstance.stringToUTF8(
-    voice,
-    voicePtr,
-    espeakInstance.lengthBytesUTF8(voice) + 1,
-  );
+  const voiceBytes = espeakInstance.lengthBytesUTF8(voice) + 1;
+  const voicePtr = espeakInstance._malloc(voiceBytes);
+  espeakInstance.stringToUTF8(voice, voicePtr, voiceBytes);
   espeakInstance._espeak_SetVoiceByName(voicePtr);
   espeakInstance._free(voicePtr);
 
   // Prepare text
-  const textPtr = espeakInstance._malloc(
-    espeakInstance.lengthBytesUTF8(text) + 1,
-  );
-  espeakInstance.stringToUTF8(
-    text,
-    textPtr,
-    espeakInstance.lengthBytesUTF8(text) + 1,
-  );
+  const textBytes = espeakInstance.lengthBytesUTF8(text) + 1;
+  const textPtr = espeakInstance._malloc(textBytes);
+  espeakInstance.stringToUTF8(text, textPtr, textBytes);
 
   const textPtrPtr = espeakInstance._malloc(4);
   espeakInstance.setValue(textPtrPtr, textPtr, "*");

From d8b4727826fbb34667bf6c238cb95df3486b0da9 Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Sat, 13 Jun 2026 11:11:45 +0200
Subject: [PATCH 6/8] better highlighting

---
 demo.html         |  13 ++-
 resources/demo.js | 287 ++++++++++++++++++++++++++--------------------
 2 files changed, 177 insertions(+), 123 deletions(-)

diff --git a/demo.html b/demo.html
index 1287ecb5..77dddc94 100644
--- a/demo.html
+++ b/demo.html
@@ -30,9 +30,20 @@
           background: #fff;
       }
 
+      /* Each sentence is clickable to seek playback there. */
+      #highlightView .sentence {
+          cursor: pointer;
+          border-radius: 2px;
+      }
+
+      /* Lighter tint on hover advertises the click-to-seek affordance; :not(.active) keeps
+         the solid highlight on the sentence that is actually playing. */
+      #highlightView .sentence:hover:not(.active) {
+          background: #fff3c4;
+      }
+
       #highlightView .sentence.active {
           background: #ffe08a;
-          border-radius: 2px;
       }
 
       #logo {
diff --git a/resources/demo.js b/resources/demo.js
index 7a2a0ee2..4c11f314 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -13,35 +13,23 @@ const SENTENCE_GAP_SECONDS = 0.2;
 
 // Web Audio playback state (created lazily on first user gesture, reused after).
 let audioCtx = null;
-// Bumped on each Speak click so an in-flight stream knows to abort.
-let playbackGeneration = 0;
-// Source nodes scheduled for the current playback, so we can stop them on re-click.
-let activeSources = [];
-
-// Live-highlight state. `spans` are the per-sentence elements in the read-only view;
-// `segments[i]` is { startTime, endTime } on the audio clock for spans[i]. The rAF loop
-// matches audioCtx.currentTime against the segments to highlight the playing sentence.
-let highlightSpans = [];
-let highlightSegments = [];
-let highlightRAF = null;
-let activeHighlight = -1;
-// True once the synth loop has scheduled every sentence, so the rAF loop knows it can end
-// when the audio clock passes the last segment (rather than stopping mid-stream).
-let highlightSynthDone = false;
-
-function clearHighlight() {
-  if (activeHighlight >= 0 && highlightSpans[activeHighlight]) {
-    highlightSpans[activeHighlight].classList.remove("active");
-  }
-  activeHighlight = -1;
-}
-
-function stopHighlightLoop() {
-  if (highlightRAF !== null) {
-    cancelAnimationFrame(highlightRAF);
-    highlightRAF = null;
-  }
-}
+// Bumped on Speak/Stop to abort an in-flight synthesis stream. A seek does NOT bump it, so
+// clicking a sentence reschedules playback without killing ongoing synthesis.
+let synthGeneration = 0;
+// True once synthesis has produced every sentence, so the highlight chain knows it may end
+// (revert to the editor) when the audio passes the last sentence rather than mid-stream.
+let synthDone = false;
+// Each sentence's decoded audio, kept index-aligned with the .sentence spans in the view.
+// Retained so a seek can replay without re-synthesizing; never cleared by clearSchedule.
+let sentenceBuffers = [];
+// Audio-clock time the next scheduled source should start at. Per-run scheduling timing and
+// sources live on the spans themselves.
+let nextStartTime = 0;
+// The highlight chain (see armHighlight): the sentence it is about to light, and the single
+// pending setTimeout handle. `highlightTimer === null` means the chain is idle/parked, and is
+// the sole guard against starting a second chain.
+let highlightIndex = 0;
+let highlightTimer = null;
 
 // Read a numeric scale input, returning null when blank/invalid so piper falls back to the
 // voice config default.
@@ -50,20 +38,6 @@ function parseScaleOrNull(input) {
   return isNaN(value) ? null : value;
 }
 
-function stopPlayback() {
-  for (const src of activeSources) {
-    try {
-      src.stop();
-    } catch {
-      // Already stopped/ended.
-    }
-  }
-  activeSources = [];
-  stopHighlightLoop();
-  clearHighlight();
-  highlightSegments = [];
-}
-
 async function main() {
   const fileModel = document.getElementById("fileModel");
   const fileConfig = document.getElementById("fileConfig");
@@ -143,56 +117,134 @@ async function main() {
     textInput.hidden = false;
   }
 
+  // The sentence spans, in document order — index === sentence index === sentenceBuffers
+  // index. The DOM is the list; no separate array is kept.
+  function sentences() {
+    return [...highlightView.querySelectorAll(".sentence")];
+  }
+
   // Reset the read-only view to empty, ready to receive per-sentence spans.
   function resetHighlightView() {
     highlightView.textContent = "";
-    highlightSpans = [];
+  }
+
+  // Tear down the current playback run: cancel the pending highlight timer and stop every
+  // sounding source, and clear each span's per-run timing/highlight. Leaves sentenceBuffers
+  // and the spans themselves intact, so a seek can re-schedule from them. Always nulls
+  // highlightTimer — and the clearTimeout is what makes the chain's captured spans seek-safe
+  // (a seek cancels a pending fire before it can light a now-stale span).
+  function clearSchedule() {
+    if (highlightTimer !== null) {
+      clearTimeout(highlightTimer);
+      highlightTimer = null;
+    }
+    for (const span of sentences()) {
+      if (span.source) {
+        try {
+          span.source.stop();
+        } catch {
+          // Already stopped/ended.
+        }
+        span.source = null;
+      }
+      span.startTime = undefined;
+      span.endTime = undefined;
+      span.classList.remove("active");
+    }
+  }
+
+  // Schedule one sentence to play right after the previously scheduled one, recording its
+  // timing and source on the span, then make sure the highlight chain is running.
+  function scheduleOne(span, buffer) {
+    const source = audioCtx.createBufferSource();
+    source.buffer = buffer;
+    source.connect(audioCtx.destination);
+
+    if (nextStartTime === 0) {
+      nextStartTime = audioCtx.currentTime + 0.1; // small lead-in
+    }
+    // Never schedule in the past: a slow synth yields a gap, not an overlap.
+    nextStartTime = Math.max(nextStartTime, audioCtx.currentTime);
+    source.start(nextStartTime);
+    span.source = source;
+    span.startTime = nextStartTime;
+    span.endTime = nextStartTime + buffer.duration;
+    nextStartTime = span.endTime + SENTENCE_GAP_SECONDS;
+
+    status.innerHTML = "Speaking...";
+    ensureHighlight();
+  }
+
+  // Move the highlight to a span.
+  function setActive(span) {
+    const previous = highlightView.querySelector(".sentence.active");
+    if (previous) {
+      previous.classList.remove("active");
+    }
+    span.classList.add("active");
+    span.scrollIntoView({ block: "nearest" });
+  }
+
+  // Arm the single timer for the next highlight transition, keyed to the audio clock. Each
+  // delay is re-derived from the live audioCtx.currentTime (clamped to 0 when the start has
+  // already passed), so no drift accumulates; inter-sentence gaps are handled because we
+  // fire on the next sentence's start, keeping the previous one lit until then.
+  function armHighlight() {
+    const spans = sentences();
+    const next = spans[highlightIndex];
+    if (next && next.startTime !== undefined) {
+      highlightTimer = setTimeout(
+        () => {
+          setActive(next);
+          highlightIndex++;
+          armHighlight();
+        },
+        Math.max(0, (next.startTime - audioCtx.currentTime) * 1000),
+      );
+    } else if (synthDone) {
+      // Everything is highlighted; revert to the editor after the last sentence ends.
+      const last = spans[spans.length - 1];
+      highlightTimer = setTimeout(
+        finishPlayback,
+        Math.max(0, (last.endTime - audioCtx.currentTime) * 1000),
+      );
+    } else {
+      // Next sentence isn't synthesized yet; park. scheduleOne() re-arms when it arrives.
+      highlightTimer = null;
+    }
+  }
+
+  // Start the highlight chain if it is idle. This `highlightTimer === null` gate is the ONLY
+  // place a chain is started (besides its own self-re-arm), preventing two concurrent chains.
+  function ensureHighlight() {
+    if (highlightTimer === null) {
+      armHighlight();
+    }
+  }
+
+  // Seek: (re)play starting from a given sentence, reusing the retained buffers. Does NOT
+  // bump synthGeneration, so any in-flight synthesis keeps running and its tail appends to
+  // this fresh schedule. Triggered by clicking a sentence.
+  function playFrom(index) {
+    clearSchedule();
+    highlightIndex = index;
+    nextStartTime = 0;
+    const spans = sentences();
+    for (let i = index; i < sentenceBuffers.length; i++) {
+      scheduleOne(spans[i], sentenceBuffers[i]);
+    }
   }
 
   // Final cleanup when playback ends naturally: drop the highlight, return to the editor,
   // and reset the UI to idle.
   function finishPlayback() {
-    clearHighlight();
+    clearSchedule();
     showEditor();
     status.innerHTML = "Ready";
     buttonSpeak.innerHTML = "Speak";
     speaking = false;
   }
 
-  // Poll the audio clock each frame and light up whichever sentence span is currently
-  // playing. BufferSource has no "start" event, so matching audioCtx.currentTime against
-  // the segment table is the reliable trigger, and it self-corrects against scheduling
-  // gaps. Runs until superseded or the audio passes the last scheduled segment.
-  function startHighlightLoop(generation) {
-    const tick = () => {
-      if (generation !== playbackGeneration) {
-        return; // Superseded; stopPlayback already cleaned up.
-      }
-      const t = audioCtx.currentTime;
-      const i = highlightSegments.findIndex(
-        (seg) => t >= seg.startTime && t < seg.endTime,
-      );
-      // Keep the current sentence lit through inter-sentence gaps (i === -1); only switch
-      // when a new sentence actually starts.
-      if (i >= 0 && i !== activeHighlight) {
-        clearHighlight();
-        highlightSpans[i].classList.add("active");
-        highlightSpans[i].scrollIntoView({ block: "nearest" });
-        activeHighlight = i;
-      }
-
-      const last = highlightSegments[highlightSegments.length - 1];
-      if (highlightSynthDone && (!last || t >= last.endTime)) {
-        highlightRAF = null;
-        finishPlayback();
-      } else {
-        highlightRAF = requestAnimationFrame(tick);
-      }
-    };
-    stopHighlightLoop();
-    highlightRAF = requestAnimationFrame(tick);
-  }
-
   async function speak() {
     if (!voiceUrl) {
       alert("Voice model is not set");
@@ -221,9 +273,15 @@ async function main() {
     const noiseScale = parseScaleOrNull(inputNoiseScale);
     const noiseWScale = parseScaleOrNull(inputNoiseWScale);
 
-    // Stop any in-progress playback and mark this as the current generation.
-    const generation = ++playbackGeneration;
-    stopPlayback();
+    // Fresh run: abort any in-flight synthesis (synthGeneration), tear down playback, and
+    // reset the playhead, retained buffers, and view. clearSchedule does not touch
+    // highlightIndex, so reset it here.
+    const generation = ++synthGeneration;
+    clearSchedule();
+    synthDone = false;
+    sentenceBuffers = [];
+    highlightIndex = 0;
+    nextStartTime = 0;
 
     if (!audioCtx) {
       audioCtx = new AudioContext();
@@ -231,15 +289,12 @@ async function main() {
     await audioCtx.resume(); // requires a user gesture, which this click is
 
     const sampleRate = getSampleRate();
-    let nextStartTime = 0;
 
     // Swap the editable textarea for the read-only highlight view, which fills in sentence
     // by sentence as synthesis progresses.
-    highlightSynthDone = false;
     resetHighlightView();
     showHighlightView();
     let viewCursor = 0; // Char offset already emitted into the view.
-    let loopStarted = false;
 
     status.innerHTML = "Synthesizing audio...";
     try {
@@ -250,65 +305,52 @@ async function main() {
         noiseScale,
         noiseWScale,
       )) {
-        // A newer click superseded us while we were synthesizing.
-        if (generation !== playbackGeneration) {
+        // A newer Speak/Stop superseded us while we were synthesizing. (A seek does NOT
+        // bump synthGeneration, so this keeps going across seeks.)
+        if (generation !== synthGeneration) {
           return;
         }
 
         // Append any text between the previous sentence and this one as plain text, then
-        // the sentence itself as a highlightable span. spans and segments stay in lock-step.
+        // the sentence itself as a clickable span (click = seek here). Timing fields start
+        // undefined so the highlight chain never fires for an unscheduled span.
         if (start > viewCursor) {
           highlightView.appendChild(
             document.createTextNode(text.slice(viewCursor, start)),
           );
         }
+        const index = sentenceBuffers.length;
         const span = document.createElement("span");
         span.className = "sentence";
         span.textContent = text.slice(start, end);
+        span.startTime = undefined;
+        span.endTime = undefined;
+        span.addEventListener("click", () => playFrom(index));
         highlightView.appendChild(span);
-        highlightSpans.push(span);
         viewCursor = end;
 
-        // Schedule this sentence to play right after the previous one.
+        // Retain the decoded audio and schedule this one sentence onto the current timeline
+        // (streaming appends exactly one; a seek to an earlier sentence is handled by playFrom).
         const buffer = audioCtx.createBuffer(1, audio.length, sampleRate);
         buffer.copyToChannel(audio, 0);
-        const source = audioCtx.createBufferSource();
-        source.buffer = buffer;
-        source.connect(audioCtx.destination);
-
-        if (nextStartTime === 0) {
-          nextStartTime = audioCtx.currentTime + 0.1; // small lead-in
-          status.innerHTML = "Speaking...";
-        }
-        // Never schedule in the past: a slow synth yields a gap, not an overlap.
-        nextStartTime = Math.max(nextStartTime, audioCtx.currentTime);
-        const startedAt = nextStartTime;
-        source.start(startedAt);
-        nextStartTime += buffer.duration + SENTENCE_GAP_SECONDS;
-
-        activeSources.push(source);
-        highlightSegments.push({
-          startTime: startedAt,
-          endTime: startedAt + buffer.duration,
-        });
-
-        if (!loopStarted) {
-          loopStarted = true;
-          startHighlightLoop(generation);
-        }
+        sentenceBuffers.push(buffer);
+        scheduleOne(span, buffer);
       }
     } catch (e) {
       status.innerHTML = "Error while synthesizing";
-      stopPlayback();
+      clearSchedule();
       showEditor();
       throw e;
     }
 
-    // All sentences scheduled. Let the rAF loop end naturally once the audio plays out;
-    // if nothing was produced, there is no loop to restore the editor, so do it here.
-    highlightSynthDone = true;
-    if (!loopStarted) {
+    // All sentences produced. Mark done, THEN kick the highlight chain: if synthesis briefly
+    // lagged playback the chain parked on the last sentence with synthDone still false, and
+    // this is what arms the finish timer so the view reverts. (Empty text scheduled nothing.)
+    synthDone = true;
+    if (sentenceBuffers.length === 0) {
       finishPlayback();
+    } else {
+      ensureHighlight();
     }
   }
 
@@ -327,9 +369,10 @@ async function main() {
         buttonSpeak.innerHTML = "Speak";
       }
     } else {
-      // If the user clicks Stop while we're still speaking, stop immediately.
-      playbackGeneration++;
-      stopPlayback();
+      // If the user clicks Stop while we're still speaking, stop immediately: abort synth
+      // (synthGeneration) and tear down playback (clearSchedule).
+      synthGeneration++;
+      clearSchedule();
       showEditor();
       speaking = false;
       status.innerHTML = "Ready";

From 712f30c3d7e52c437c0cb7cfcc42133326c2eaad Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Sun, 21 Jun 2026 10:06:32 +0200
Subject: [PATCH 7/8] Final changes

---
 resources/demo.js | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/resources/demo.js b/resources/demo.js
index 4c11f314..67bcd0b2 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -62,6 +62,7 @@ async function main() {
     // Reset config
     voiceConfigUrl = "";
     fileConfig.value = "";
+    divConfig.hidden = true;
     speakerSelect.value = "";
 
     voiceUrl = URL.createObjectURL(file);
@@ -73,19 +74,7 @@ async function main() {
       const voiceConfig = await response.json();
       updateUIForConfig(voiceConfig);
       divConfig.hidden = true;
-
-      if (voiceUrl != loadedVoiceUrl) {
-        status.innerHTML = "Loading voice...";
-        try {
-          await setVoice(voiceUrl, voiceConfigUrl);
-        }
-        catch (e) {
-          status.innerHTML = "Error loading voice";
-          throw e;
-        }
-        loadedVoiceUrl = voiceUrl;
-      }
-
+      await loadVoice();
       status.innerHTML = "Ready";
       buttonSpeak.disabled = false;
     } else {
@@ -103,10 +92,25 @@ async function main() {
     const voiceConfig = JSON.parse(await file.text());
     updateUIForConfig(voiceConfig);
     voiceConfigUrl = URL.createObjectURL(file);
+    await loadVoice();
     status.innerHTML = "Ready";
     buttonSpeak.disabled = false;
   });
 
+  async function loadVoice() {
+    if (voiceUrl != loadedVoiceUrl) {
+      status.innerHTML = "Loading voice...";
+      try {
+        await setVoice(voiceUrl, voiceConfigUrl);
+      }
+      catch (e) {
+        status.innerHTML = "Error loading voice";
+        throw e;
+      }
+      loadedVoiceUrl = voiceUrl;
+    }
+  }
+
   function showHighlightView() {
     textInput.hidden = true;
     highlightView.hidden = false;

From 03142d636275e5c58fd9c01ef6bca86b27ace1b1 Mon Sep 17 00:00:00 2001
From: Ivan <ivan.zderadicka@gmail.com>
Date: Sun, 21 Jun 2026 14:21:05 +0200
Subject: [PATCH 8/8] Update .gitignore

---
 .claude/settings.json | 10 ----------
 .gitignore            |  1 +
 2 files changed, 1 insertion(+), 10 deletions(-)
 delete mode 100644 .claude/settings.json

diff --git a/.claude/settings.json b/.claude/settings.json
deleted file mode 100644
index d8c9779f..00000000
--- a/.claude/settings.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(rm -f /tmp/bytemap_test.mjs; find . -name \"*.onnx\" 2>/dev/null | head; echo \"---configs---\"; ls configs/ 2>/dev/null | head)"
-    ],
-    "additionalDirectories": [
-      "/tmp"
-    ]
-  }
-}
diff --git a/.gitignore b/.gitignore
index 2d6ba342..d560899a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ output/
 .python-version
 
 .vscode/
+.claude/