diff --git a/.gitignore b/.gitignore
index c995e60..d560899 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,6 @@ output/
/install/
.python-version
+
+.vscode/
+.claude/
diff --git a/README.md b/README.md
index e2f0912..487f5ec 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
# Piper Samples
-Samples for [Piper](https://github.com/rhasspy/piper) text to speech system.
+
+Samples for [Piper](https://github.com/OHF-Voice/piper1-gpl) text to speech system.
+
+## Demo
+To run the demo, start `python serve.py`; it serves the page with the COOP and COEP headers that ONNX Runtime needs in order to run with multiple threads.
\ No newline at end of file
diff --git a/demo.html b/demo.html
index e63b939..77dddc9 100644
--- a/demo.html
+++ b/demo.html
@@ -9,8 +9,41 @@
max-width: 90%;
}
- #textInput {
+ /* Shared box so the highlight view and the textarea look identical when swapped. */
+ #textInput, #highlightView {
width: 100%;
+ box-sizing: border-box;
+ font-family: inherit;
+ font-size: 1em;
+ line-height: 1.4;
+ padding: 4px;
+ border: 1px solid #767676;
+ border-radius: 2px;
+ }
+
+ #highlightView {
+ min-height: calc(5 * 1.4em + 8px); /* ~5 rows + padding, matching the textarea */
+ max-height: 40vh;
+ overflow: auto;
+ white-space: pre-wrap;
+ word-wrap: break-word;
+ background: #fff;
+ }
+
+ /* Each sentence is clickable to seek playback there. */
+ #highlightView .sentence {
+ cursor: pointer;
+ border-radius: 2px;
+ }
+
+ /* Lighter tint on hover advertises the click-to-seek affordance; :not(.active) keeps
+ the solid highlight on the sentence that is actually playing. */
+ #highlightView .sentence:hover:not(.active) {
+ background: #fff3c4;
+ }
+
+ #highlightView .sentence.active {
+ background: #ffe08a;
}
#logo {
@@ -34,11 +67,6 @@
margin-bottom: 20px;
}
- #divSpeak > audio {
- vertical-align: bottom;
- margin-left: 10px;
- }
-
#status {
margin-left: 10px;
}
@@ -75,10 +103,10 @@
+
diff --git a/resources/demo.js b/resources/demo.js
index 30d5310..67bcd0b 100644
--- a/resources/demo.js
+++ b/resources/demo.js
@@ -1,24 +1,60 @@
-import { setVoice, textToWavAudio } from "./piper.js";
+import {
+ setVoice,
+ textToAudioSentences,
+ getSampleRate,
+} from "./piper.js";
let voiceUrl = "";
let loadedVoiceUrl = "";
let voiceConfigUrl = "";
+// Silence inserted between sentences when scheduling live playback. Tune to taste.
+const SENTENCE_GAP_SECONDS = 0.2;
+
+// Web Audio playback state (created lazily on first user gesture, reused after).
+let audioCtx = null;
+// Bumped on Speak/Stop to abort an in-flight synthesis stream. A seek does NOT bump it, so
+// clicking a sentence reschedules playback without killing ongoing synthesis.
+let synthGeneration = 0;
+// True once synthesis has produced every sentence, so the highlight chain knows it may end
+// (revert to the editor) when the audio passes the last sentence rather than mid-stream.
+let synthDone = false;
+// Each sentence's decoded audio, kept index-aligned with the .sentence spans in the view.
+// Retained so a seek can replay without re-synthesizing; never cleared by clearSchedule.
+let sentenceBuffers = [];
+// Audio-clock time the next scheduled source should start at. Per-run scheduling timing and
+// sources live on the spans themselves.
+let nextStartTime = 0;
+// The highlight chain (see armHighlight): the sentence it is about to light, and the single
+// pending setTimeout handle. `highlightTimer === null` means the chain is idle/parked, and is
+// the sole guard against starting a second chain.
+let highlightIndex = 0;
+let highlightTimer = null;
+
+// Read a numeric scale input, returning null when blank, invalid, or non-finite so piper
+// falls back to the voice config default.
+function parseScaleOrNull(input) {
+  const value = Number.parseFloat(input.value);
+  return Number.isFinite(value) ? value : null; // rejects NaN and +/-Infinity alike
+}
+
async function main() {
const fileModel = document.getElementById("fileModel");
const fileConfig = document.getElementById("fileConfig");
const divConfig = document.getElementById("divConfig");
const buttonSpeak = document.getElementById("buttonSpeak");
- const audioTTS = document.getElementById("audioTTS");
const textInput = document.getElementById("textInput");
+ const highlightView = document.getElementById("highlightView");
const status = document.getElementById("status");
const speakerSelect = document.getElementById("speaker");
const inputLengthScale = document.getElementById("lengthScale");
const inputNoiseScale = document.getElementById("noiseScale");
const inputNoiseWScale = document.getElementById("noiseWScale");
- fileModel.addEventListener("change", async () => {
- const file = event.target.files[0];
+ let speaking = false;
+
+ fileModel.addEventListener("change", async (e) => {
+ const file = e.target.files[0];
if (!file) {
return;
}
@@ -26,6 +62,7 @@ async function main() {
// Reset config
voiceConfigUrl = "";
fileConfig.value = "";
+ divConfig.hidden = true;
speakerSelect.value = "";
voiceUrl = URL.createObjectURL(file);
@@ -37,14 +74,17 @@ async function main() {
const voiceConfig = await response.json();
updateUIForConfig(voiceConfig);
divConfig.hidden = true;
+ await loadVoice();
+ status.innerHTML = "Ready";
+ buttonSpeak.disabled = false;
} else {
divConfig.hidden = false;
speakerSelect.hidden = true;
}
});
- fileConfig.addEventListener("change", async () => {
- const file = event.target.files[0];
+ fileConfig.addEventListener("change", async (e) => {
+ const file = e.target.files[0];
if (!file) {
return;
}
@@ -52,15 +92,170 @@ async function main() {
const voiceConfig = JSON.parse(await file.text());
updateUIForConfig(voiceConfig);
voiceConfigUrl = URL.createObjectURL(file);
+ await loadVoice();
+ status.innerHTML = "Ready";
+ buttonSpeak.disabled = false;
});
- buttonSpeak.addEventListener("click", async () => {
+ async function loadVoice() {
+ if (voiceUrl != loadedVoiceUrl) {
+ status.innerHTML = "Loading voice...";
+ try {
+ await setVoice(voiceUrl, voiceConfigUrl);
+ }
+ catch (e) {
+ status.innerHTML = "Error loading voice";
+ throw e;
+ }
+ loadedVoiceUrl = voiceUrl;
+ }
+ }
+
+ function showHighlightView() {
+ textInput.hidden = true;
+ highlightView.hidden = false;
+ }
+
+ function showEditor() {
+ highlightView.hidden = true;
+ textInput.hidden = false;
+ }
+
+ // The sentence spans, in document order — index === sentence index === sentenceBuffers
+ // index. The DOM is the list; no separate array is kept.
+ function sentences() {
+ return [...highlightView.querySelectorAll(".sentence")];
+ }
+
+ // Reset the read-only view to empty, ready to receive per-sentence spans.
+ function resetHighlightView() {
+ highlightView.textContent = "";
+ }
+
+ // Tear down the current playback run: cancel the pending highlight timer and stop every
+ // sounding source, and clear each span's per-run timing/highlight. Leaves sentenceBuffers
+ // and the spans themselves intact, so a seek can re-schedule from them. Always nulls
+ // highlightTimer — and the clearTimeout is what makes the chain's captured spans seek-safe
+ // (a seek cancels a pending fire before it can light a now-stale span).
+ function clearSchedule() {
+ if (highlightTimer !== null) {
+ clearTimeout(highlightTimer);
+ highlightTimer = null;
+ }
+ for (const span of sentences()) {
+ if (span.source) {
+ try {
+ span.source.stop();
+ } catch {
+ // Already stopped/ended.
+ }
+ span.source = null;
+ }
+ span.startTime = undefined;
+ span.endTime = undefined;
+ span.classList.remove("active");
+ }
+ }
+
+ // Schedule one sentence to play right after the previously scheduled one, recording its
+ // timing and source on the span, then make sure the highlight chain is running.
+ function scheduleOne(span, buffer) {
+ const source = audioCtx.createBufferSource();
+ source.buffer = buffer;
+ source.connect(audioCtx.destination);
+
+ if (nextStartTime === 0) {
+ nextStartTime = audioCtx.currentTime + 0.1; // small lead-in
+ }
+ // Never schedule in the past: a slow synth yields a gap, not an overlap.
+ nextStartTime = Math.max(nextStartTime, audioCtx.currentTime);
+ source.start(nextStartTime);
+ span.source = source;
+ span.startTime = nextStartTime;
+ span.endTime = nextStartTime + buffer.duration;
+ nextStartTime = span.endTime + SENTENCE_GAP_SECONDS;
+
+ status.innerHTML = "Speaking...";
+ ensureHighlight();
+ }
+
+ // Move the highlight to a span.
+ function setActive(span) {
+ const previous = highlightView.querySelector(".sentence.active");
+ if (previous) {
+ previous.classList.remove("active");
+ }
+ span.classList.add("active");
+ span.scrollIntoView({ block: "nearest" });
+ }
+
+ // Arm the single timer for the next highlight transition, keyed to the audio clock. The
+ // span's scheduled start is in the future, so each delay is re-derived from the live
+ // audioCtx.currentTime — no drift accumulates and inter-sentence gaps are handled because
+ // we fire on the next sentence's start, keeping the previous one lit until then.
+ function armHighlight() {
+ const spans = sentences();
+ const next = spans[highlightIndex];
+ if (next && next.startTime !== undefined) {
+ highlightTimer = setTimeout(
+ () => {
+ setActive(next);
+ highlightIndex++;
+ armHighlight();
+ },
+ Math.max(0, (next.startTime - audioCtx.currentTime) * 1000),
+ );
+ } else if (synthDone) {
+ // Everything is highlighted; revert to the editor after the last sentence ends.
+ const last = spans[spans.length - 1];
+ highlightTimer = setTimeout(
+ finishPlayback,
+ Math.max(0, (last.endTime - audioCtx.currentTime) * 1000),
+ );
+ } else {
+ // Next sentence isn't synthesized yet; park. scheduleOne() re-arms when it arrives.
+ highlightTimer = null;
+ }
+ }
+
+ // Start the highlight chain if it is idle. This `highlightTimer === null` gate is the ONLY
+ // place a chain is started (besides its own self-re-arm), preventing two concurrent chains.
+ function ensureHighlight() {
+ if (highlightTimer === null) {
+ armHighlight();
+ }
+ }
+
+ // Seek: (re)play starting from a given sentence, reusing the retained buffers. Does NOT
+ // bump synthGeneration, so any in-flight synthesis keeps running and its tail appends to
+ // this fresh schedule. Triggered by clicking a sentence.
+ function playFrom(index) {
+ clearSchedule();
+ highlightIndex = index;
+ nextStartTime = 0;
+ const spans = sentences();
+ for (let i = index; i < sentenceBuffers.length; i++) {
+ scheduleOne(spans[i], sentenceBuffers[i]);
+ }
+ }
+
+ // Final cleanup when playback ends naturally: drop the highlight, return to the editor,
+ // and reset the UI to idle.
+ function finishPlayback() {
+ clearSchedule();
+ showEditor();
+ status.innerHTML = "Ready";
+ buttonSpeak.innerHTML = "Speak";
+ speaking = false;
+ }
+
+ async function speak() {
if (!voiceUrl) {
alert("Voice model is not set");
return;
}
- if (!voiceConfigUrl) {
+ if (!loadedVoiceUrl) {
alert("Voice config is not set");
return;
}
@@ -71,50 +266,127 @@ async function main() {
return;
}
- if (voiceUrl != loadedVoiceUrl) {
- status.innerHTML = "Loading voice...";
- await setVoice(voiceUrl, voiceConfigUrl);
- loadedVoiceUrl = voiceUrl;
- }
+
let speakerId = null;
if (speakerSelect.selectedIndex > 0) {
speakerId = parseInt(speakerSelect.value);
}
- let lengthScale = parseFloat(inputLengthScale.value);
- if (isNaN(lengthScale)) {
- lengthScale = null;
- }
+ const lengthScale = parseScaleOrNull(inputLengthScale);
+ const noiseScale = parseScaleOrNull(inputNoiseScale);
+ const noiseWScale = parseScaleOrNull(inputNoiseWScale);
- let noiseScale = parseFloat(inputNoiseScale.value);
- if (isNaN(noiseScale)) {
- noiseScale = null;
- }
+ // Fresh run: abort any in-flight synthesis (synthGeneration), tear down playback, and
+ // reset the playhead, retained buffers, and view. clearSchedule does not touch
+ // highlightIndex, so reset it here.
+ const generation = ++synthGeneration;
+ clearSchedule();
+ synthDone = false;
+ sentenceBuffers = [];
+ highlightIndex = 0;
+ nextStartTime = 0;
- let noiseWScale = parseFloat(inputNoiseWScale.value);
- if (isNaN(noiseWScale)) {
- noiseWScale = null;
+ if (!audioCtx) {
+ audioCtx = new AudioContext();
}
+ await audioCtx.resume(); // requires a user gesture, which this click is
+
+ const sampleRate = getSampleRate();
+
+ // Swap the editable textarea for the read-only highlight view, which fills in sentence
+ // by sentence as synthesis progresses.
+ resetHighlightView();
+ showHighlightView();
+ let viewCursor = 0; // Char offset already emitted into the view.
status.innerHTML = "Synthesizing audio...";
- const wavAudio = await textToWavAudio(
- text,
- speakerId,
- lengthScale,
- noiseScale,
- noiseWScale,
- );
- const audioURL = URL.createObjectURL(wavAudio);
+ try {
+ for await (const { audio, start, end } of textToAudioSentences(
+ text,
+ speakerId,
+ lengthScale,
+ noiseScale,
+ noiseWScale,
+ )) {
+ // A newer Speak/Stop superseded us while we were synthesizing. (A seek does NOT
+ // bump synthGeneration, so this keeps going across seeks.)
+ if (generation !== synthGeneration) {
+ return;
+ }
- audioTTS.src = audioURL;
- audioTTS.play();
+ // Append any text between the previous sentence and this one as plain text, then
+ // the sentence itself as a clickable span (click = seek here). Timing fields start
+ // undefined so the highlight loop never matches an unscheduled span.
+ if (start > viewCursor) {
+ highlightView.appendChild(
+ document.createTextNode(text.slice(viewCursor, start)),
+ );
+ }
+ const index = sentenceBuffers.length;
+ const span = document.createElement("span");
+ span.className = "sentence";
+ span.textContent = text.slice(start, end);
+ span.startTime = undefined;
+ span.endTime = undefined;
+ span.addEventListener("click", () => playFrom(index));
+ highlightView.appendChild(span);
+ viewCursor = end;
- status.innerHTML = "Ready";
+ // Retain the decoded audio and schedule this one sentence onto the current timeline
+ // (streaming appends exactly one; a seek to an earlier sentence is handled by playFrom).
+ const buffer = audioCtx.createBuffer(1, audio.length, sampleRate);
+ buffer.copyToChannel(audio, 0);
+ sentenceBuffers.push(buffer);
+ scheduleOne(span, buffer);
+ }
+ } catch (e) {
+ status.innerHTML = "Error while synthesizing";
+ clearSchedule();
+ showEditor();
+ throw e;
+ }
+
+ // All sentences produced. Mark done, THEN kick the highlight chain: if synthesis briefly
+ // lagged playback the chain parked on the last sentence with synthDone still false, and
+ // this is what arms the finish timer so the view reverts. (Empty text scheduled nothing.)
+ synthDone = true;
+ if (sentenceBuffers.length === 0) {
+ finishPlayback();
+ } else {
+ ensureHighlight();
+ }
+ }
+
+
+ buttonSpeak.addEventListener("click", async () => {
+ if (!speaking) {
+ speaking = true;
+ buttonSpeak.innerHTML = "Stop";
+ try {
+ // Stays "speaking" through playback; finishPlayback() resets the UI when the
+ // audio plays out. speak() resolves once synthesis is scheduled, not when audio ends.
+ await speak();
+ } catch {
+ // speak() already restored the editor and set an error status.
+ speaking = false;
+ buttonSpeak.innerHTML = "Speak";
+ }
+ } else {
+ // If the user clicks Stop while we're still speaking, stop immediately: abort synth
+ // (synthGeneration) and tear down playback (clearSchedule).
+ synthGeneration++;
+ clearSchedule();
+ showEditor();
+ speaking = false;
+ status.innerHTML = "Ready";
+ buttonSpeak.innerHTML = "Speak";
+ }
});
textInput.disabled = false;
- buttonSpeak.disabled = false;
+ buttonSpeak.disabled = true;
+ status.innerHTML = "Load voice to begin";
fileModel.value = "";
fileConfig.value = "";
}
@@ -138,19 +410,18 @@ function updateUIForConfig(voiceConfig) {
} else {
// Multi-speaker model
const speakerIdMap = voiceConfig.speaker_id_map;
- let sortedSpeakers = Object.keys(speakerIdMap).sort(
+ const sortedSpeakers = Object.keys(speakerIdMap).sort(
(a, b) => speakerIdMap[a] - speakerIdMap[b],
);
- for (let i in sortedSpeakers) {
- let speaker = sortedSpeakers[i];
- let option = document.createElement("option");
- option.text = speaker + " (" + i.toString() + ")";
- option.value = i.toString();
+ for (const speaker of sortedSpeakers) {
+ const id = speakerIdMap[speaker];
+ const option = document.createElement("option");
+ option.text = `${speaker} (${id})`;
+ option.value = String(id);
speakerSelect.add(option);
}
- const selectSpeaker = document.getElementById("divSpeaker");
- divSpeaker.hidden = false;
+ document.getElementById("divSpeaker").hidden = false;
}
if (speakerSelect.options.length > 1) {
diff --git a/resources/piper.js b/resources/piper.js
index 82ebdb1..8225738 100644
--- a/resources/piper.js
+++ b/resources/piper.js
@@ -2,6 +2,14 @@
import EspeakModule from "./espeakng.worker.js";
+// Run onnxruntime inference in a Web Worker so it doesn't block the UI thread.
+ort.env.wasm.proxy = true;
+
+// Use multiple threads for inference. This only takes effect when the page is
+// cross-origin isolated (COOP + COEP headers -> SharedArrayBuffer available);
+// otherwise onnxruntime-web silently falls back to a single thread. See serve.py.
+ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
+
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
const espeakCHARS_AUTO = 0;
@@ -26,7 +34,6 @@ const EOS = "$";
const PAD = "_";
let espeakInstance = null;
-let espeakInitialized = false;
let voiceModel = null;
let voiceConfig = null;
@@ -49,29 +56,90 @@ async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
voiceModel = await ort.InferenceSession.create(voiceModelUrl);
}
+function getSampleRate() {
+ if (!voiceConfig) {
+ throw new Error("Voice is not set");
+ }
+ return voiceConfig.audio.sample_rate;
+}
+
+// Resolve each scale: explicit argument, else the voice config's `inference` value (if any), else a built-in default.
+function resolveScales(lengthScale, noiseScale, noiseWScale) {
+  return {
+    lengthScale: lengthScale ?? voiceConfig.inference?.length_scale ?? 1.0,
+    noiseScale: noiseScale ?? voiceConfig.inference?.noise_scale ?? 0.667,
+    noiseWScale: noiseWScale ?? voiceConfig.inference?.noise_w ?? 0.8,
+  };
+}
+
+// Run the ONNX model on a single utterance's phoneme ids, returning Float32 PCM.
+async function synthesizeIds(
+ phonemeIds,
+ speakerId,
+ lengthScale,
+ noiseScale,
+ noiseWScale,
+) {
+ const phonemeIdsTensor = new ort.Tensor(
+ "int64",
+ new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
+ [1, phonemeIds.length],
+ );
+ const phonemeLengthsTensor = new ort.Tensor(
+ "int64",
+ BigInt64Array.from([BigInt(phonemeIds.length)]),
+ [1],
+ );
+ const scalesTensor = new ort.Tensor(
+ "float32",
+ Float32Array.from([noiseScale, lengthScale, noiseWScale]),
+ [3],
+ );
+
+ let feeds = {
+ input: phonemeIdsTensor,
+ input_lengths: phonemeLengthsTensor,
+ scales: scalesTensor,
+ };
+
+ if (voiceConfig.num_speakers > 1) {
+ // Multi-speaker
+ feeds["sid"] = new ort.Tensor(
+ "int64",
+ BigInt64Array.from([BigInt(speakerId ?? 0)]),
+ );
+ }
+
+ const results = await voiceModel.run(feeds);
+ return results.output.cpuData;
+}
+
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
async function textToWavAudio(
text,
speakerId = undefined,
- noiseScale = undefined,
lengthScale = undefined,
+ noiseScale = undefined,
noiseWScale = undefined,
) {
if (!voiceConfig) {
throw new Error("Voice is not set");
}
- const sampleRate = voiceConfig.audio.sample_rate;
const float32Audio = await textToFloat32Audio(
text,
speakerId,
- noiseScale,
lengthScale,
+ noiseScale,
noiseWScale,
);
- return float32ToWavBlob(float32Audio, sampleRate);
+ return float32ToWavBlob(float32Audio, getSampleRate());
}
+// Currently unused by the demo (kept for the public API; the demo streams via
+// textToAudioSentences instead).
async function textToFloat32Audio(
text,
speakerId = undefined,
@@ -83,62 +151,90 @@ async function textToFloat32Audio(
throw new Error("Voice is not set");
}
- lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
- noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
- noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
+ const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
- if (voiceConfig.num_speakers > 1) {
- speakerId = speakerId ?? 0; // first speaker
- }
-
- const textPhonemes = textToPhonemes(text);
+ const textPhonemes = textToPhonemes(text).map((segment) => segment.phonemes);
const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
- // Run onnx model
- const phonemeIdsTensor = new ort.Tensor(
- "int64",
- new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
- [1, phonemeIds.length],
- );
- const phonemeLengthsTensor = new ort.Tensor(
- "int64",
- BigInt64Array.from([BigInt(phonemeIds.length)]),
- [1],
- );
- const scalesTensor = new ort.Tensor(
- "float32",
- Float32Array.from([noiseScale, lengthScale, noiseWScale]),
- [3],
+ return synthesizeIds(
+ phonemeIds,
+ speakerId,
+ scales.lengthScale,
+ scales.noiseScale,
+ scales.noiseWScale,
);
+}
- let feeds = {
- input: phonemeIdsTensor,
- input_lengths: phonemeLengthsTensor,
- scales: scalesTensor,
- };
+// Synthesize a sentence at a time, yielding Float32 PCM for each as soon as it is
+// ready. Lets the caller start playing early instead of waiting for the whole text.
+async function* textToAudioSentences(
+ text,
+ speakerId = undefined,
+ lengthScale = undefined,
+ noiseScale = undefined,
+ noiseWScale = undefined,
+) {
+ if (!voiceConfig) {
+ throw new Error("Voice is not set");
+ }
- if (voiceConfig.num_speakers > 1) {
- // Multi-speaker
- feeds["sid"] = new ort.Tensor(
- "int64",
- BigInt64Array.from([BigInt(speakerId)]),
+ const scales = resolveScales(lengthScale, noiseScale, noiseWScale);
+
+ // textToPhonemes already segments into per-sentence { phonemes, start, end }.
+ const sentences = textToPhonemes(text);
+
+ for (const sentence of sentences) {
+ const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence.phonemes]);
+ const audio = await synthesizeIds(
+ phonemeIds,
+ speakerId,
+ scales.lengthScale,
+ scales.noiseScale,
+ scales.noiseWScale,
);
+ // start/end are character indices into `text`, so the caller can highlight the slice
+ // this audio was synthesized from.
+ yield { audio, start: sentence.start, end: sentence.end };
}
+}
- const results = await voiceModel.run(feeds);
- const float32Audio = results.output.cpuData;
+function utf8ByteLength(codePoint) {
+ if (codePoint <= 0x7f) return 1;
+ if (codePoint <= 0x7ff) return 2;
+ if (codePoint <= 0xffff) return 3;
+ return 4;
+}
- return float32Audio;
+// espeak reports clause boundaries as UTF-8 byte offsets, but the displayed text is
+// indexed in JS string units. Those offsets only ever move forward, so we translate them
+// with a single forward-walking cursor (no lookup table): each call advances through the
+// string until it reaches the requested byte offset and returns the character index there.
+function makeByteToCharCursor(text) {
+ let byte = 0;
+  let char = 0; // UTF-16 code-unit index into `text` (an astral character spans two units).
+ return (targetByte) => {
+ while (byte < targetByte && char < text.length) {
+ const codePoint = text.codePointAt(char);
+ byte += utf8ByteLength(codePoint);
+ // Advance one whole character: astral code points are a surrogate pair, so they
+ // occupy two UTF-16 string indices; everything in the BMP occupies one.
+ char += codePoint > 0xffff ? 2 : 1;
+ }
+ return char;
+ };
}
+// Segment text into per-sentence units. Returns an array of
+// { phonemes, start, end } where start/end are character indices into the original
+// `text`, identifying the slice each sentence was synthesized from.
function textToPhonemes(text) {
if (!voiceConfig) {
throw new Error("Voice is not set");
}
if (voiceConfig.phoneme_type == "text") {
- // Text phonemes
- return [Array.from(text.normalize("NFD"))];
+ // Text phonemes: the whole text is a single sentence.
+ return [{ phonemes: Array.from(text.normalize("NFD")), start: 0, end: text.length }];
}
if (!espeakInstance) {
@@ -148,26 +244,16 @@ function textToPhonemes(text) {
const voice = voiceConfig.espeak.voice;
// Set voice
- const voicePtr = espeakInstance._malloc(
- espeakInstance.lengthBytesUTF8(voice) + 1,
- );
- espeakInstance.stringToUTF8(
- voice,
- voicePtr,
- espeakInstance.lengthBytesUTF8(voice) + 1,
- );
+ const voiceBytes = espeakInstance.lengthBytesUTF8(voice) + 1;
+ const voicePtr = espeakInstance._malloc(voiceBytes);
+ espeakInstance.stringToUTF8(voice, voicePtr, voiceBytes);
espeakInstance._espeak_SetVoiceByName(voicePtr);
espeakInstance._free(voicePtr);
// Prepare text
- const textPtr = espeakInstance._malloc(
- espeakInstance.lengthBytesUTF8(text) + 1,
- );
- espeakInstance.stringToUTF8(
- text,
- textPtr,
- espeakInstance.lengthBytesUTF8(text) + 1,
- );
+ const textBytes = espeakInstance.lengthBytesUTF8(text) + 1;
+ const textPtr = espeakInstance._malloc(textBytes);
+ espeakInstance.stringToUTF8(text, textPtr, textBytes);
const textPtrPtr = espeakInstance._malloc(4);
espeakInstance.setValue(textPtrPtr, textPtr, "*");
@@ -175,13 +261,27 @@ function textToPhonemes(text) {
// End of clause and sentences
const terminatorPtr = espeakInstance._malloc(4);
- // Phoneme lists for each sentence
+ // Translates espeak's UTF-8 byte offsets to character indices into the original `text`
+ // so they can slice/highlight it directly.
+ const toChar = makeByteToCharCursor(text);
+
+ // Sentence segments, each { phonemes, start, end } in character indices.
const textPhonemes = [];
// Phoneme list for current sentence
let sentencePhonemes = [];
+ // Character offsets: where the next clause begins, and where the current sentence
+ // (accumulation of clauses) began.
+ let cursorChar = 0;
+ let sentenceStartChar = 0;
+
while (true) {
+ // A new sentence is starting if we haven't accumulated any clauses for it yet.
+ if (sentencePhonemes.length === 0) {
+ sentenceStartChar = cursorChar;
+ }
+
const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
textPtrPtr,
espeakCHARS_AUTO,
@@ -209,13 +309,28 @@ function textToPhonemes(text) {
sentencePhonemes.push("; ");
}
+ // Where espeak will resume. 0 means the input is exhausted (this clause runs to the
+ // end of the text). Otherwise espeak reads one lookahead character past the clause
+ // boundary, so its resume offset overshoots the true boundary by exactly one
+ // character — subtract it back off (in character space) to land on the start of the
+ // next clause.
+ const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
+ const endChar =
+ nextTextPtr === 0
+ ? text.length
+ : Math.max(cursorChar, toChar(nextTextPtr - textPtr) - 1);
+ cursorChar = endChar;
+
if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
// End of sentence
- textPhonemes.push(sentencePhonemes);
+ textPhonemes.push({
+ phonemes: sentencePhonemes,
+ start: sentenceStartChar,
+ end: endChar,
+ });
sentencePhonemes = [];
}
- const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
if (nextTextPtr === 0) {
break; // All text processed
}
@@ -231,16 +346,20 @@ function textToPhonemes(text) {
// Add lingering phonemes
if (sentencePhonemes.length > 0) {
- textPhonemes.push(sentencePhonemes);
+ textPhonemes.push({
+ phonemes: sentencePhonemes,
+ start: sentenceStartChar,
+ end: text.length,
+ });
sentencePhonemes = [];
}
- // Prepare phonemes for Piper
- for (let i = 0; i < textPhonemes.length; i++) {
- textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
- }
-
- return textPhonemes;
+ // Prepare phonemes for Piper; start/end are already character indices into `text`.
+ return textPhonemes.map((segment) => ({
+ phonemes: Array.from(segment.phonemes.join("").normalize("NFD")),
+ start: segment.start,
+ end: segment.end,
+ }));
}
function phonemesToIds(idMap, textPhonemes) {
@@ -300,4 +419,11 @@ function float32ToWavBlob(floatArray, sampleRate) {
return new Blob([view], { type: "audio/wav" });
}
-export { setVoice, textToWavAudio, textToFloat32Audio };
+export {
+ setVoice,
+ textToWavAudio,
+ textToFloat32Audio,
+ textToAudioSentences,
+ float32ToWavBlob,
+ getSampleRate,
+};
diff --git a/serve.py b/serve.py
new file mode 100644
index 0000000..b835ca5
--- /dev/null
+++ b/serve.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Static server for the Piper demo with cross-origin isolation enabled.
+
+onnxruntime-web can only run multi-threaded WASM (using SharedArrayBuffer) when
+the page is "cross-origin isolated". That requires two response headers that the
+stock `python -m http.server` does not send:
+
+ Cross-Origin-Opener-Policy: same-origin
+ Cross-Origin-Embedder-Policy: credentialless
+
+We use `credentialless` rather than `require-corp`: both enable cross-origin
+isolation (and thus SharedArrayBuffer / threads), but `credentialless` still lets
+no-cors cross-origin assets load (e.g. the sponsor badge), instead of blocking
+anything that doesn't send a CORP header. Supported in Firefox 119+ / Chrome 110+.
+
+Run this from the demo directory:
+
+ python serve.py # serves on http://localhost:8080
+    python serve.py 9000        # custom port
+
+After loading the page, confirm in the browser console:
+
+ crossOriginIsolated === true
+
+If that is false, the headers are not reaching the browser and inference will
+stay single-threaded.
+"""
+
+import sys
+from functools import partial
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class CrossOriginIsolatedHandler(SimpleHTTPRequestHandler):
+ def end_headers(self):
+ self.send_header("Cross-Origin-Opener-Policy", "same-origin")
+ self.send_header("Cross-Origin-Embedder-Policy", "credentialless")
+ # Avoid stale assets while developing.
+ self.send_header("Cache-Control", "no-store")
+ super().end_headers()
+
+
+def main():
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8080  # optional first CLI argument
+    handler = partial(CrossOriginIsolatedHandler, directory=".")
+    with ThreadingHTTPServer(("127.0.0.1", port), handler) as httpd:  # loopback only, not the LAN
+        print(f"Serving cross-origin-isolated demo on http://localhost:{port}")
+        print("Confirm `crossOriginIsolated === true` in the browser console.")
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:
+            print("\nStopped.")
+
+
+if __name__ == "__main__":
+ main()