diff --git a/.gitignore b/.gitignore
index c995e60..d560899 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,6 @@ output/
 /install/
 
 .python-version
+
+.vscode/
+.claude/
diff --git a/README.md b/README.md
index e2f0912..487f5ec 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
 # Piper Samples
 
-Samples for [Piper](https://github.com/rhasspy/piper) text to speech system.
+
+Samples for [Piper](https://github.com/OHF-Voice/piper1-gpl) text to speech system.
+
+## Demo
+To run the demo, start `python serve.py`. It sends the COOP and COEP headers that ONNX Runtime requires to run with multiple threads.
\ No newline at end of file
diff --git a/demo.html b/demo.html
index e63b939..77dddc9 100644
--- a/demo.html
+++ b/demo.html
@@ -9,8 +9,41 @@
           max-width: 90%;
       }
 
-      #textInput {
+      /* Shared box so the highlight view and the textarea look identical when swapped. */
+      #textInput, #highlightView {
           width: 100%;
+          box-sizing: border-box;
+          font-family: inherit;
+          font-size: 1em;
+          line-height: 1.4;
+          padding: 4px;
+          border: 1px solid #767676;
+          border-radius: 2px;
+      }
+
+      #highlightView {
+          min-height: calc(5 * 1.4em + 8px); /* ~5 rows + padding, matching the textarea */
+          max-height: 40vh;
+          overflow: auto;
+          white-space: pre-wrap;
+          word-wrap: break-word;
+          background: #fff;
+      }
+
+      /* Each sentence is clickable to seek playback there. */
+      #highlightView .sentence {
+          cursor: pointer;
+          border-radius: 2px;
+      }
+
+      /* Lighter tint on hover advertises the click-to-seek affordance; :not(.active) keeps
+         the solid highlight on the sentence that is actually playing. */
+      #highlightView .sentence:hover:not(.active) {
+          background: #fff3c4;
+      }
+
+      #highlightView .sentence.active {
+          background: #ffe08a;
       }
 
       #logo {
@@ -34,11 +67,6 @@
           margin-bottom: 20px;
       }
 
-      #divSpeak > audio {
-          vertical-align: bottom;
-          margin-left: 10px;
-      }
-
       #status {
           margin-left: 10px;
       }
@@ -75,10 +103,10 @@
     </div>
 
     <textarea id="textInput" rows=5 aria-label="Text to speak" disabled></textarea>
+    <div id="highlightView" aria-live="polite" hidden></div>
 
     <div id="divSpeak">
       <button id="buttonSpeak" aria-label="Speak text" disabled>Speak</button>
-      <audio id="audioTTS" aria-label="Spoken audio" controls></audio>
       <span id="status" role="status" aria-live="polite">Ready</span>
     </div>
 
diff --git a/resources/demo.js b/resources/demo.js
index 30d5310..67bcd0b 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -1,24 +1,60 @@
-import { setVoice, textToWavAudio } from "./piper.js";
+import {
+  setVoice,
+  textToAudioSentences,
+  getSampleRate,
+} from "./piper.js";
 
 let voiceUrl = "";
 let loadedVoiceUrl = "";
 let voiceConfigUrl = "";
 
+// Silence inserted between sentences when scheduling live playback. Tune to taste.
+const SENTENCE_GAP_SECONDS = 0.2;
+
+// Web Audio playback state (created lazily on first user gesture, reused after).
+let audioCtx = null;
+// Bumped on Speak/Stop to abort an in-flight synthesis stream. A seek does NOT bump it, so
+// clicking a sentence reschedules playback without killing ongoing synthesis.
+let synthGeneration = 0;
+// True once synthesis has produced every sentence, so the highlight chain knows it may end
+// (revert to the editor) when the audio passes the last sentence rather than mid-stream.
+let synthDone = false;
+// Each sentence's decoded audio, kept index-aligned with the .sentence spans in the view.
+// Retained so a seek can replay without re-synthesizing; never cleared by clearSchedule.
+let sentenceBuffers = [];
+// Audio-clock time the next scheduled source should start at. Per-run scheduling timing and
+// sources live on the spans themselves.
+let nextStartTime = 0;
+// The highlight chain (see armHighlight): the sentence it is about to light, and the single
+// pending setTimeout handle. `highlightTimer === null` means the chain is idle/parked, and is
+// the sole guard against starting a second chain.
+let highlightIndex = 0;
+let highlightTimer = null;
+
+// Parse a scale <input>'s value as a float. Blank or non-numeric input yields null, which
+// the caller passes on so piper can fall back to the voice config default.
+function parseScaleOrNull(input) {
+  const parsed = Number.parseFloat(input.value);
+  return Number.isNaN(parsed) ? null : parsed;
+}
+
 async function main() {
   const fileModel = document.getElementById("fileModel");
   const fileConfig = document.getElementById("fileConfig");
   const divConfig = document.getElementById("divConfig");
   const buttonSpeak = document.getElementById("buttonSpeak");
-  const audioTTS = document.getElementById("audioTTS");
   const textInput = document.getElementById("textInput");
+  const highlightView = document.getElementById("highlightView");
   const status = document.getElementById("status");
   const speakerSelect = document.getElementById("speaker");
   const inputLengthScale = document.getElementById("lengthScale");
   const inputNoiseScale = document.getElementById("noiseScale");
   const inputNoiseWScale = document.getElementById("noiseWScale");
 
-  fileModel.addEventListener("change", async () => {
-    const file = event.target.files[0];
+  let speaking = false;
+
+  fileModel.addEventListener("change", async (e) => {
+    const file = e.target.files[0];
     if (!file) {
       return;
     }
@@ -26,6 +62,7 @@ async function main() {
     // Reset config
     voiceConfigUrl = "";
     fileConfig.value = "";
+    divConfig.hidden = true;
     speakerSelect.value = "";
 
     voiceUrl = URL.createObjectURL(file);
@@ -37,14 +74,17 @@ async function main() {
       const voiceConfig = await response.json();
       updateUIForConfig(voiceConfig);
       divConfig.hidden = true;
+      await loadVoice();
+      status.innerHTML = "Ready";
+      buttonSpeak.disabled = false;
     } else {
       divConfig.hidden = false;
       speakerSelect.hidden = true;
     }
   });
 
-  fileConfig.addEventListener("change", async () => {
-    const file = event.target.files[0];
+  fileConfig.addEventListener("change", async (e) => {
+    const file = e.target.files[0];
     if (!file) {
       return;
     }
@@ -52,15 +92,170 @@ async function main() {
     const voiceConfig = JSON.parse(await file.text());
     updateUIForConfig(voiceConfig);
     voiceConfigUrl = URL.createObjectURL(file);
+    await loadVoice();
+    status.innerHTML = "Ready";
+    buttonSpeak.disabled = false;
   });
 
-  buttonSpeak.addEventListener("click", async () => {
+  // Load the selected voice into piper unless it is already loaded; rethrows load errors.
+  async function loadVoice() {
+    const url = voiceUrl; // Snapshot: the selection can change while setVoice() is awaited.
+    if (url === loadedVoiceUrl) return;
+    status.innerHTML = "Loading voice...";
+    try {
+      await setVoice(url, voiceConfigUrl);
+    } catch (e) {
+      status.innerHTML = "Error loading voice";
+      throw e;
+    }
+    loadedVoiceUrl = url; // Record what was actually loaded, not the live selection.
+  }
+
+  function showHighlightView() {
+    textInput.hidden = true;
+    highlightView.hidden = false;
+  }
+
+  function showEditor() {
+    highlightView.hidden = true;
+    textInput.hidden = false;
+  }
+
+  // The sentence spans, in document order — index === sentence index === sentenceBuffers
+  // index. The DOM is the list; no separate array is kept.
+  function sentences() {
+    return [...highlightView.querySelectorAll(".sentence")];
+  }
+
+  // Reset the read-only view to empty, ready to receive per-sentence spans.
+  function resetHighlightView() {
+    highlightView.textContent = "";
+  }
+
+  // End the current playback run without discarding anything a seek needs. Cancels the
+  // pending highlight timer (so a queued fire cannot light a stale span) and always leaves
+  // highlightTimer null; stops each span's audio source; and resets each span's start/end
+  // times and "active" class. sentenceBuffers and the spans themselves are left untouched,
+  // so playback can be re-scheduled from them.
+  function clearSchedule() {
+    // clearTimeout ignores a null handle, so the timer needs no guard before cancelling.
+    clearTimeout(highlightTimer);
+    highlightTimer = null;
+    sentences().forEach((span) => {
+      const { source } = span;
+      if (source) {
+        try {
+          source.stop();
+        } catch {
+          // Already stopped/ended.
+        }
+        span.source = null;
+      }
+      span.startTime = undefined;
+      span.endTime = undefined;
+      span.classList.remove("active");
+    });
+  }
+
+  // Queue one sentence's audio directly after whatever is already scheduled, store the
+  // source and its start/end times on the span, and start the highlight chain if idle.
+  function scheduleOne(span, buffer) {
+    const node = audioCtx.createBufferSource();
+    node.buffer = buffer;
+    node.connect(audioCtx.destination);
+
+    // A zero playhead means nothing is scheduled yet: begin after a small lead-in.
+    const playhead = nextStartTime === 0 ? audioCtx.currentTime + 0.1 : nextStartTime;
+    // Never schedule in the past: a slow synth yields a gap, not an overlap.
+    const startAt = Math.max(playhead, audioCtx.currentTime);
+    node.start(startAt);
+
+    span.source = node;
+    span.startTime = startAt;
+    span.endTime = startAt + buffer.duration;
+    nextStartTime = span.endTime + SENTENCE_GAP_SECONDS;
+
+    status.innerHTML = "Speaking...";
+    ensureHighlight();
+  }
+
+  // Make `span` the only highlighted sentence and keep it visible.
+  function setActive(span) {
+    // Un-highlight whichever sentence was lit before, if any.
+    highlightView.querySelector(".sentence.active")?.classList.remove("active");
+
+    span.classList.add("active");
+    // "nearest" scrolls only as far as needed to bring the span into view.
+    span.scrollIntoView({ block: "nearest" });
+  }
+
+  // Arm the single timer for the next highlight transition, keyed to the audio clock. The
+  // span's scheduled start is in the future, so each delay is re-derived from the live
+  // audioCtx.currentTime — no drift accumulates and inter-sentence gaps are handled because
+  // we fire on the next sentence's start, keeping the previous one lit until then.
+  function armHighlight() {
+    const spans = sentences();
+    const next = spans[highlightIndex];
+    if (next && next.startTime !== undefined) {
+      highlightTimer = setTimeout(
+        () => {
+          setActive(next);
+          highlightIndex++;
+          armHighlight();
+        },
+        Math.max(0, (next.startTime - audioCtx.currentTime) * 1000),
+      );
+    } else if (synthDone) {
+      // Everything is highlighted; revert to the editor after the last sentence ends.
+      const last = spans[spans.length - 1];
+      highlightTimer = setTimeout(
+        finishPlayback,
+        Math.max(0, (last.endTime - audioCtx.currentTime) * 1000),
+      );
+    } else {
+      // Next sentence isn't synthesized yet; park. scheduleOne() re-arms when it arrives.
+      highlightTimer = null;
+    }
+  }
+
+  // Start the highlight chain if it is idle. This `highlightTimer === null` gate is the ONLY
+  // place a chain is started (besides its own self-re-arm), preventing two concurrent chains.
+  function ensureHighlight() {
+    if (highlightTimer === null) {
+      armHighlight();
+    }
+  }
+
+  // Seek to sentence `index` (bound to each sentence span's click handler): restart playback
+  // there from the retained buffers. synthGeneration is left alone, so synthesis that is
+  // still running continues and its later sentences append to this new schedule.
+  function playFrom(index) {
+    clearSchedule();
+    highlightIndex = index;
+    nextStartTime = 0;
+    const spans = sentences();
+    sentenceBuffers.slice(index).forEach((buffer, offset) => {
+      scheduleOne(spans[index + offset], buffer);
+    });
+  }
+
+  // Final cleanup when playback ends naturally: drop the highlight, return to the editor,
+  // and reset the UI to idle.
+  function finishPlayback() {
+    clearSchedule();
+    showEditor();
+    status.innerHTML = "Ready";
+    buttonSpeak.innerHTML = "Speak";
+    speaking = false;
+  }
+
+  async function speak() {
     if (!voiceUrl) {
       alert("Voice model is not set");
       return;
     }
 
-    if (!voiceConfigUrl) {
+    if (!loadedVoiceUrl) {
       alert("Voice config is not set");
       return;
     }
@@ -71,50 +266,127 @@ async function main() {
       return;
     }
 
-    if (voiceUrl != loadedVoiceUrl) {
-      status.innerHTML = "Loading voice...";
-      await setVoice(voiceUrl, voiceConfigUrl);
-      loadedVoiceUrl = voiceUrl;
-    }
+
 
     let speakerId = null;
     if (speakerSelect.selectedIndex > 0) {
       speakerId = parseInt(speakerSelect.value);
     }
 
-    let lengthScale = parseFloat(inputLengthScale.value);
-    if (isNaN(lengthScale)) {
-      lengthScale = null;
-    }
+    const lengthScale = parseScaleOrNull(inputLengthScale);
+    const noiseScale = parseScaleOrNull(inputNoiseScale);
+    const noiseWScale = parseScaleOrNull(inputNoiseWScale);
 
-    let noiseScale = parseFloat(inputNoiseScale.value);
-    if (isNaN(noiseScale)) {
-      noiseScale = null;
-    }
+    // Fresh run: abort any in-flight synthesis (synthGeneration), tear down playback, and
+    // reset the playhead, retained buffers, and view. clearSchedule does not touch
+    // highlightIndex, so reset it here.
+    const generation = ++synthGeneration;
+    clearSchedule();
+    synthDone = false;
+    sentenceBuffers = [];
+    highlightIndex = 0;
+    nextStartTime = 0;
 
-    let noiseWScale = parseFloat(inputNoiseWScale.value);
-    if (isNaN(noiseWScale)) {
-      noiseWScale = null;
+    if (!audioCtx) {
+      audioCtx = new AudioContext();
     }
+    await audioCtx.resume(); // requires a user gesture, which this click is
+
+    const sampleRate = getSampleRate();
+
+    // Swap the editable textarea for the read-only highlight view, which fills in sentence
+    // by sentence as synthesis progresses.
+    resetHighlightView();
+    showHighlightView();
+    let viewCursor = 0; // Char offset already emitted into the view.
 
     status.innerHTML = "Synthesizing audio...";
-    const wavAudio = await textToWavAudio(
-      text,
-      speakerId,
-      lengthScale,
-      noiseScale,
-      noiseWScale,
-    );
-    const audioURL = URL.createObjectURL(wavAudio);
+    try {
+      for await (const { audio, start, end } of textToAudioSentences(
+        text,
+        speakerId,
+        lengthScale,
+        noiseScale,
+        noiseWScale,
+      )) {
+        // A newer Speak/Stop superseded us while we were synthesizing. (A seek does NOT
+        // bump synthGeneration, so this keeps going across seeks.)
+        if (generation !== synthGeneration) {
+          return;
+        }
 
-    audioTTS.src = audioURL;
-    audioTTS.play();
+        // Append any text between the previous sentence and this one as plain text, then
+        // the sentence itself as a clickable span (click = seek here). Timing fields start
+        // undefined so the highlight loop never matches an unscheduled span.
+        if (start > viewCursor) {
+          highlightView.appendChild(
+            document.createTextNode(text.slice(viewCursor, start)),
+          );
+        }
+        const index = sentenceBuffers.length;
+        const span = document.createElement("span");
+        span.className = "sentence";
+        span.textContent = text.slice(start, end);
+        span.startTime = undefined;
+        span.endTime = undefined;
+        span.addEventListener("click", () => playFrom(index));
+        highlightView.appendChild(span);
+        viewCursor = end;
 
-    status.innerHTML = "Ready";
+        // Retain the decoded audio and schedule this one sentence onto the current timeline
+        // (streaming appends exactly one; a seek to an earlier sentence is handled by playFrom).
+        const buffer = audioCtx.createBuffer(1, audio.length, sampleRate);
+        buffer.copyToChannel(audio, 0);
+        sentenceBuffers.push(buffer);
+        scheduleOne(span, buffer);
+      }
+    } catch (e) {
+      status.innerHTML = "Error while synthesizing";
+      clearSchedule();
+      showEditor();
+      throw e;
+    }
+
+    // All sentences produced. Mark done, THEN kick the highlight chain: if synthesis briefly
+    // lagged playback the chain parked on the last sentence with synthDone still false, and
+    // this is what arms the finish timer so the view reverts. (Empty text scheduled nothing.)
+    synthDone = true;
+    if (sentenceBuffers.length === 0) {
+      finishPlayback();
+    } else {
+      ensureHighlight();
+    }
+  }
+
+
+  buttonSpeak.addEventListener("click", async () => {
+    if (speaking) {
+      // Stop: abort in-flight synthesis (synthGeneration) and tear down playback.
+      synthGeneration++;
+      clearSchedule();
+      showEditor();
+      status.innerHTML = "Ready";
+    } else {
+      speaking = true;
+      buttonSpeak.innerHTML = "Stop";
+      const generationBefore = synthGeneration;
+      try {
+        // Resolves once synthesis is scheduled; finishPlayback() resets the UI later.
+        await speak();
+        // A changed generation means a run started (or Stop already reset the UI).
+        if (synthGeneration !== generationBefore) return;
+      } catch (e) {
+        console.error(e); // On synthesis errors speak() has already restored the editor.
+      }
+      // speak() failed or returned from a validation check without starting a run.
+    }
+    speaking = false;
+    buttonSpeak.innerHTML = "Speak";
   });
 
   textInput.disabled = false;
-  buttonSpeak.disabled = false;
+  buttonSpeak.disabled = true;
+  status.innerHTML = "Load voice to begin";
   fileModel.value = "";
   fileConfig.value = "";
 }
@@ -138,19 +410,18 @@ function updateUIForConfig(voiceConfig) {
   } else {
     // Multi-speaker model
     const speakerIdMap = voiceConfig.speaker_id_map;
-    let sortedSpeakers = Object.keys(speakerIdMap).sort(
+    const sortedSpeakers = Object.keys(speakerIdMap).sort(
       (a, b) => speakerIdMap[a] - speakerIdMap[b],
     );
-    for (let i in sortedSpeakers) {
-      let speaker = sortedSpeakers[i];
-      let option = document.createElement("option");
-      option.text = speaker + " (" + i.toString() + ")";
-      option.value = i.toString();
+    for (const speaker of sortedSpeakers) {
+      const id = speakerIdMap[speaker];
+      const option = document.createElement("option");
+      option.text = `${speaker} (${id})`;
+      option.value = String(id);
       speakerSelect.add(option);
     }
 
-    const selectSpeaker = document.getElementById("divSpeaker");
-    divSpeaker.hidden = false;
+    document.getElementById("divSpeaker").hidden = false;
   }
 
   if (speakerSelect.options.length > 1) {
diff --git a/resources/piper.js b/resources/piper.js
index 82ebdb1..8225738 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -2,6 +2,14 @@
 
 import EspeakModule from "./espeakng.worker.js";
 
+// Run onnxruntime inference in a Web Worker so it doesn't block the UI thread.
+ort.env.wasm.proxy = true;
+
+// Use multiple threads for inference. This only takes effect when the page is
+// cross-origin isolated (COOP + COEP headers -> SharedArrayBuffer available);
+// otherwise onnxruntime-web silently falls back to a single thread. See serve.py.
+ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
+
 const AUDIO_OUTPUT_SYNCHRONOUS = 2;
 const espeakCHARS_AUTO = 0;
 
@@ -26,7 +34,6 @@ const EOS = "$";
 const PAD = "_";
 
 let espeakInstance = null;
-let espeakInitialized = false;
 let voiceModel = null;
 let voiceConfig = null;
 
@@ -49,29 +56,90 @@ async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
   voiceModel = await ort.InferenceSession.create(voiceModelUrl);
 }
 
+function getSampleRate() {
+  if (!voiceConfig) {
+    throw new Error("Voice is not set");
+  }
+  return voiceConfig.audio.sample_rate;
+}
+
+// Resolve each scale: argument, else voice config (if it has `inference`), else a default.
+function resolveScales(lengthScale, noiseScale, noiseWScale) {
+  return {
+    lengthScale: lengthScale ?? voiceConfig.inference?.length_scale ?? 1.0,
+    noiseScale: noiseScale ?? voiceConfig.inference?.noise_scale ?? 0.667,
+    noiseWScale: noiseWScale ?? voiceConfig.inference?.noise_w ?? 0.8,
+  };
+}
+
+// Run the ONNX voice model on a single utterance and return its audio.
+//   phonemeIds  - phoneme ids for one utterance
+//   speakerId   - speaker to use for multi-speaker voices (null/undefined -> 0)
+//   lengthScale, noiseScale, noiseWScale - numeric scales, expected to be resolved already
+// Resolves to the model's `output` tensor data (Float32 PCM).
+async function synthesizeIds(
+  phonemeIds,
+  speakerId,
+  lengthScale,
+  noiseScale,
+  noiseWScale,
+) {
+  // Build an int64 tensor from plain numbers; `dims` is forwarded only when supplied.
+  const int64Tensor = (values, ...dims) =>
+    new ort.Tensor(
+      "int64",
+      BigInt64Array.from(values, (value) => BigInt(value)),
+      ...dims,
+    );
+
+  const idCount = phonemeIds.length;
+  const feeds = {
+    input: int64Tensor(phonemeIds, [1, idCount]),
+    input_lengths: int64Tensor([idCount], [1]),
+    // Packed in the order noise, length, noise-w.
+    scales: new ort.Tensor(
+      "float32",
+      Float32Array.from([noiseScale, lengthScale, noiseWScale]),
+      [3],
+    ),
+  };
+
+  if (voiceConfig.num_speakers > 1) {
+    // Multi-speaker voices take a speaker id input; default to the first speaker.
+    feeds.sid = int64Tensor([speakerId ?? 0]);
+  }
+
+  // Inference is awaited here; only the raw sample data is handed back.
+  const { output } = await voiceModel.run(feeds);
+  return output.cpuData;
+}
+
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
 async function textToWavAudio(
   text,
   speakerId = undefined,
-  noiseScale = undefined,
   lengthScale = undefined,
+  noiseScale = undefined,
   noiseWScale = undefined,
 ) {
   if (!voiceConfig) {
     throw new Error("Voice is not set");
   }
 
-  const sampleRate = voiceConfig.audio.sample_rate;
   const float32Audio = await textToFloat32Audio(
     text,
     speakerId,
-    noiseScale,
     lengthScale,
+    noiseScale,
     noiseWScale,
   );
 
-  return float32ToWavBlob(float32Audio, sampleRate);
+  return float32ToWavBlob(float32Audio, getSampleRate());
 }
 
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
 async function textToFloat32Audio(
   text,
   speakerId = undefined,
@@ -83,62 +151,90 @@ async function textToFloat32Audio(
     throw new Error("Voice is not set");
   }
 
-  lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
-  noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
-  noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
+  const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
 
-  if (voiceConfig.num_speakers > 1) {
-    speakerId = speakerId ?? 0; // first speaker
-  }
-
-  const textPhonemes = textToPhonemes(text);
+  const textPhonemes = textToPhonemes(text).map((segment) => segment.phonemes);
   const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
 
-  // Run onnx model
-  const phonemeIdsTensor = new ort.Tensor(
-    "int64",
-    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
-    [1, phonemeIds.length],
-  );
-  const phonemeLengthsTensor = new ort.Tensor(
-    "int64",
-    BigInt64Array.from([BigInt(phonemeIds.length)]),
-    [1],
-  );
-  const scalesTensor = new ort.Tensor(
-    "float32",
-    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
-    [3],
+  return synthesizeIds(
+    phonemeIds,
+    speakerId,
+    scales.lengthScale,
+    scales.noiseScale,
+    scales.noiseWScale,
   );
+}
 
-  let feeds = {
-    input: phonemeIdsTensor,
-    input_lengths: phonemeLengthsTensor,
-    scales: scalesTensor,
-  };
+// Synthesize a sentence at a time, yielding Float32 PCM for each as soon as it is
+// ready. Lets the caller start playing early instead of waiting for the whole text.
+async function* textToAudioSentences(
+  text,
+  speakerId = undefined,
+  lengthScale = undefined,
+  noiseScale = undefined,
+  noiseWScale = undefined,
+) {
+  if (!voiceConfig) {
+    throw new Error("Voice is not set");
+  }
 
-  if (voiceConfig.num_speakers > 1) {
-    // Multi-speaker
-    feeds["sid"] = new ort.Tensor(
-      "int64",
-      BigInt64Array.from([BigInt(speakerId)]),
+  const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
+
+  // textToPhonemes already segments into per-sentence { phonemes, start, end }.
+  const sentences = textToPhonemes(text);
+
+  for (const sentence of sentences) {
+    const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence.phonemes]);
+    const audio = await synthesizeIds(
+      phonemeIds,
+      speakerId,
+      scales.lengthScale,
+      scales.noiseScale,
+      scales.noiseWScale,
     );
+    // start/end are character indices into `text`, so the caller can highlight the slice
+    // this audio was synthesized from.
+    yield { audio, start: sentence.start, end: sentence.end };
   }
+}
 
-  const results = await voiceModel.run(feeds);
-  const float32Audio = results.output.cpuData;
+// Number of bytes UTF-8 uses to encode `codePoint` (1 to 4, chosen by code point range).
+function utf8ByteLength(codePoint) {
+  // Thresholds are the largest code points that fit in 1, 2 and 3 bytes respectively.
+  return codePoint <= 0x7f ? 1 : codePoint <= 0x7ff ? 2 : codePoint <= 0xffff ? 3 : 4;
+}
+
 
-  return float32Audio;
+// espeak reports clause boundaries as UTF-8 byte offsets, but the displayed text is
+// indexed in JS string units. Those offsets only ever move forward, so we translate them
+// with a single forward-walking cursor (no lookup table): each call advances through the
+// string until it reaches the requested byte offset and returns the character index there.
+function makeByteToCharCursor(text) {
+  let byte = 0;
+  let char = 0; // JS string index == character index (surrogate pairs count as 2).
+  return (targetByte) => {
+    while (byte < targetByte && char < text.length) {
+      const codePoint = text.codePointAt(char);
+      byte += utf8ByteLength(codePoint);
+      // Advance one whole character: astral code points are a surrogate pair, so they
+      // occupy two UTF-16 string indices; everything in the BMP occupies one.
+      char += codePoint > 0xffff ? 2 : 1;
+    }
+    return char;
+  };
 }
 
+// Segment text into per-sentence units. Returns an array of
+// { phonemes, start, end } where start/end are character indices into the original
+// `text`, identifying the slice each sentence was synthesized from.
 function textToPhonemes(text) {
   if (!voiceConfig) {
     throw new Error("Voice is not set");
   }
 
   if (voiceConfig.phoneme_type == "text") {
-    // Text phonemes
-    return [Array.from(text.normalize("NFD"))];
+    // Text phonemes: the whole text is a single sentence.
+    return [{ phonemes: Array.from(text.normalize("NFD")), start: 0, end: text.length }];
   }
 
   if (!espeakInstance) {
@@ -148,26 +244,16 @@ function textToPhonemes(text) {
   const voice = voiceConfig.espeak.voice;
 
   // Set voice
-  const voicePtr = espeakInstance._malloc(
-    espeakInstance.lengthBytesUTF8(voice) + 1,
-  );
-  espeakInstance.stringToUTF8(
-    voice,
-    voicePtr,
-    espeakInstance.lengthBytesUTF8(voice) + 1,
-  );
+  const voiceBytes = espeakInstance.lengthBytesUTF8(voice) + 1;
+  const voicePtr = espeakInstance._malloc(voiceBytes);
+  espeakInstance.stringToUTF8(voice, voicePtr, voiceBytes);
   espeakInstance._espeak_SetVoiceByName(voicePtr);
   espeakInstance._free(voicePtr);
 
   // Prepare text
-  const textPtr = espeakInstance._malloc(
-    espeakInstance.lengthBytesUTF8(text) + 1,
-  );
-  espeakInstance.stringToUTF8(
-    text,
-    textPtr,
-    espeakInstance.lengthBytesUTF8(text) + 1,
-  );
+  const textBytes = espeakInstance.lengthBytesUTF8(text) + 1;
+  const textPtr = espeakInstance._malloc(textBytes);
+  espeakInstance.stringToUTF8(text, textPtr, textBytes);
 
   const textPtrPtr = espeakInstance._malloc(4);
   espeakInstance.setValue(textPtrPtr, textPtr, "*");
@@ -175,13 +261,27 @@ function textToPhonemes(text) {
   // End of clause and sentences
   const terminatorPtr = espeakInstance._malloc(4);
 
-  // Phoneme lists for each sentence
+  // Translates espeak's UTF-8 byte offsets to character indices into the original `text`
+  // so they can slice/highlight it directly.
+  const toChar = makeByteToCharCursor(text);
+
+  // Sentence segments, each { phonemes, start, end } in character indices.
   const textPhonemes = [];
 
   // Phoneme list for current sentence
   let sentencePhonemes = [];
 
+  // Character offsets: where the next clause begins, and where the current sentence
+  // (accumulation of clauses) began.
+  let cursorChar = 0;
+  let sentenceStartChar = 0;
+
   while (true) {
+    // A new sentence is starting if we haven't accumulated any clauses for it yet.
+    if (sentencePhonemes.length === 0) {
+      sentenceStartChar = cursorChar;
+    }
+
     const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
       textPtrPtr,
       espeakCHARS_AUTO,
@@ -209,13 +309,28 @@ function textToPhonemes(text) {
       sentencePhonemes.push("; ");
     }
 
+    // Where espeak will resume. 0 means the input is exhausted (this clause runs to the
+    // end of the text). Otherwise espeak reads one lookahead character past the clause
+    // boundary, so its resume offset overshoots the true boundary by exactly one
+    // character — subtract it back off (in character space) to land on the start of the
+    // next clause.
+    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
+    const endChar =
+      nextTextPtr === 0
+        ? text.length
+        : Math.max(cursorChar, toChar(nextTextPtr - textPtr) - 1);
+    cursorChar = endChar;
+
     if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
       // End of sentence
-      textPhonemes.push(sentencePhonemes);
+      textPhonemes.push({
+        phonemes: sentencePhonemes,
+        start: sentenceStartChar,
+        end: endChar,
+      });
       sentencePhonemes = [];
     }
 
-    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
     if (nextTextPtr === 0) {
       break; // All text processed
     }
@@ -231,16 +346,20 @@ function textToPhonemes(text) {
 
   // Add lingering phonemes
   if (sentencePhonemes.length > 0) {
-    textPhonemes.push(sentencePhonemes);
+    textPhonemes.push({
+      phonemes: sentencePhonemes,
+      start: sentenceStartChar,
+      end: text.length,
+    });
     sentencePhonemes = [];
   }
 
-  // Prepare phonemes for Piper
-  for (let i = 0; i < textPhonemes.length; i++) {
-    textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
-  }
-
-  return textPhonemes;
+  // Prepare phonemes for Piper; start/end are already character indices into `text`.
+  return textPhonemes.map((segment) => ({
+    phonemes: Array.from(segment.phonemes.join("").normalize("NFD")),
+    start: segment.start,
+    end: segment.end,
+  }));
 }
 
 function phonemesToIds(idMap, textPhonemes) {
@@ -300,4 +419,11 @@ function float32ToWavBlob(floatArray, sampleRate) {
   return new Blob([view], { type: "audio/wav" });
 }
 
-export { setVoice, textToWavAudio, textToFloat32Audio };
+export {
+  setVoice,
+  textToWavAudio,
+  textToFloat32Audio,
+  textToAudioSentences,
+  float32ToWavBlob,
+  getSampleRate,
+};
diff --git a/serve.py b/serve.py
new file mode 100644
index 0000000..b835ca5
--- /dev/null
+++ b/serve.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Static server for the Piper demo with cross-origin isolation enabled.
+
+onnxruntime-web can only run multi-threaded WASM (using SharedArrayBuffer) when
+the page is "cross-origin isolated". That requires two response headers that the
+stock `python -m http.server` does not send:
+
+    Cross-Origin-Opener-Policy: same-origin
+    Cross-Origin-Embedder-Policy: credentialless
+
+We use `credentialless` rather than `require-corp`: both enable cross-origin
+isolation (and thus SharedArrayBuffer / threads), but `credentialless` still lets
+no-cors cross-origin assets load (e.g. the sponsor badge), instead of blocking
+anything that doesn't send a CORP header. Supported in Firefox 119+ / Chrome 110+.
+
+Run this from the demo directory:
+
+    python serve.py            # serves on http://localhost:8080
+    python serve.py 9000       # custom port
+
+After loading the page, confirm in the browser console:
+
+    crossOriginIsolated === true
+
+If that is false, the headers are not reaching the browser and inference will
+stay single-threaded.
+"""
+
+import sys
+from functools import partial
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class CrossOriginIsolatedHandler(SimpleHTTPRequestHandler):
+    def end_headers(self):
+        self.send_header("Cross-Origin-Opener-Policy", "same-origin")
+        self.send_header("Cross-Origin-Embedder-Policy", "credentialless")
+        # Avoid stale assets while developing.
+        self.send_header("Cache-Control", "no-store")
+        super().end_headers()
+
+
+def main():
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8080
+    handler = partial(CrossOriginIsolatedHandler, directory=".")
+    with ThreadingHTTPServer(("0.0.0.0", port), handler) as httpd:
+        print(f"Serving cross-origin-isolated demo on http://localhost:{port}")
+        print("Confirm `crossOriginIsolated === true` in the browser console.")
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:
+            print("\nStopped.")
+
+
+if __name__ == "__main__":
+    main()