diff --git a/.gitignore b/.gitignore index c995e60..d560899 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ output/ /install/ .python-version + +.vscode/ +.claude/ diff --git a/README.md b/README.md index e2f0912..487f5ec 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ # Piper Samples -Samples for [Piper](https://github.com/rhasspy/piper) text to speech system. + +Samples for [Piper](https://github.com/OHF-Voice/piper1-gpl) text to speech system. + +## Demo +To run the demo, start the bundled server with `python serve.py`. It sends the COOP and COEP headers that ONNX Runtime needs in order to run with multiple threads. \ No newline at end of file diff --git a/demo.html b/demo.html index e63b939..77dddc9 100644 --- a/demo.html +++ b/demo.html @@ -9,8 +9,41 @@ max-width: 90%; } - #textInput { + /* Shared box so the highlight view and the textarea look identical when swapped. */ + #textInput, #highlightView { width: 100%; + box-sizing: border-box; + font-family: inherit; + font-size: 1em; + line-height: 1.4; + padding: 4px; + border: 1px solid #767676; + border-radius: 2px; + } + + #highlightView { + min-height: calc(5 * 1.4em + 8px); /* ~5 rows + padding, matching the textarea */ + max-height: 40vh; + overflow: auto; + white-space: pre-wrap; + word-wrap: break-word; + background: #fff; + } + + /* Each sentence is clickable to seek playback there. */ + #highlightView .sentence { + cursor: pointer; + border-radius: 2px; + } + + /* Lighter tint on hover advertises the click-to-seek affordance; :not(.active) keeps + the solid highlight on the sentence that is actually playing. */ + #highlightView .sentence:hover:not(.active) { + background: #fff3c4; + } + + #highlightView .sentence.active { + background: #ffe08a; } #logo { @@ -34,11 +67,6 @@ margin-bottom: 20px; } - #divSpeak > audio { - vertical-align: bottom; - margin-left: 10px; - } - #status { margin-left: 10px; } @@ -75,10 +103,10 @@ +
- Ready
diff --git a/resources/demo.js b/resources/demo.js index 30d5310..67bcd0b 100644 --- a/resources/demo.js +++ b/resources/demo.js @@ -1,24 +1,60 @@ -import { setVoice, textToWavAudio } from "./piper.js"; +import { + setVoice, + textToAudioSentences, + getSampleRate, +} from "./piper.js"; let voiceUrl = ""; let loadedVoiceUrl = ""; let voiceConfigUrl = ""; +// Silence inserted between sentences when scheduling live playback. Tune to taste. +const SENTENCE_GAP_SECONDS = 0.2; + +// Web Audio playback state (created lazily on first user gesture, reused after). +let audioCtx = null; +// Bumped on Speak/Stop to abort an in-flight synthesis stream. A seek does NOT bump it, so +// clicking a sentence reschedules playback without killing ongoing synthesis. +let synthGeneration = 0; +// True once synthesis has produced every sentence, so the highlight chain knows it may end +// (revert to the editor) when the audio passes the last sentence rather than mid-stream. +let synthDone = false; +// Each sentence's decoded audio, kept index-aligned with the .sentence spans in the view. +// Retained so a seek can replay without re-synthesizing; never cleared by clearSchedule. +let sentenceBuffers = []; +// Audio-clock time the next scheduled source should start at. Per-run scheduling timing and +// sources live on the spans themselves. +let nextStartTime = 0; +// The highlight chain (see armHighlight): the sentence it is about to light, and the single +// pending setTimeout handle. `highlightTimer === null` means the chain is idle/parked, and is +// the sole guard against starting a second chain. +let highlightIndex = 0; +let highlightTimer = null; + +// Read a numeric scale input, returning null when blank/invalid so piper falls back to the +// voice config default. +function parseScaleOrNull(input) { + const value = parseFloat(input.value); + return isNaN(value) ? 
null : value; +} + async function main() { const fileModel = document.getElementById("fileModel"); const fileConfig = document.getElementById("fileConfig"); const divConfig = document.getElementById("divConfig"); const buttonSpeak = document.getElementById("buttonSpeak"); - const audioTTS = document.getElementById("audioTTS"); const textInput = document.getElementById("textInput"); + const highlightView = document.getElementById("highlightView"); const status = document.getElementById("status"); const speakerSelect = document.getElementById("speaker"); const inputLengthScale = document.getElementById("lengthScale"); const inputNoiseScale = document.getElementById("noiseScale"); const inputNoiseWScale = document.getElementById("noiseWScale"); - fileModel.addEventListener("change", async () => { - const file = event.target.files[0]; + let speaking = false; + + fileModel.addEventListener("change", async (e) => { + const file = e.target.files[0]; if (!file) { return; } @@ -26,6 +62,7 @@ async function main() { // Reset config voiceConfigUrl = ""; fileConfig.value = ""; + divConfig.hidden = true; speakerSelect.value = ""; voiceUrl = URL.createObjectURL(file); @@ -37,14 +74,17 @@ async function main() { const voiceConfig = await response.json(); updateUIForConfig(voiceConfig); divConfig.hidden = true; + await loadVoice(); + status.innerHTML = "Ready"; + buttonSpeak.disabled = false; } else { divConfig.hidden = false; speakerSelect.hidden = true; } }); - fileConfig.addEventListener("change", async () => { - const file = event.target.files[0]; + fileConfig.addEventListener("change", async (e) => { + const file = e.target.files[0]; if (!file) { return; } @@ -52,15 +92,170 @@ async function main() { const voiceConfig = JSON.parse(await file.text()); updateUIForConfig(voiceConfig); voiceConfigUrl = URL.createObjectURL(file); + await loadVoice(); + status.innerHTML = "Ready"; + buttonSpeak.disabled = false; }); - buttonSpeak.addEventListener("click", async () => { + async 
function loadVoice() { + if (voiceUrl != loadedVoiceUrl) { + status.innerHTML = "Loading voice..."; + try { + await setVoice(voiceUrl, voiceConfigUrl); + } + catch (e) { + status.innerHTML = "Error loading voice"; + throw e; + } + loadedVoiceUrl = voiceUrl; + } + } + + function showHighlightView() { + textInput.hidden = true; + highlightView.hidden = false; + } + + function showEditor() { + highlightView.hidden = true; + textInput.hidden = false; + } + + // The sentence spans, in document order — index === sentence index === sentenceBuffers + // index. The DOM is the list; no separate array is kept. + function sentences() { + return [...highlightView.querySelectorAll(".sentence")]; + } + + // Reset the read-only view to empty, ready to receive per-sentence spans. + function resetHighlightView() { + highlightView.textContent = ""; + } + + // Tear down the current playback run: cancel the pending highlight timer and stop every + // sounding source, and clear each span's per-run timing/highlight. Leaves sentenceBuffers + // and the spans themselves intact, so a seek can re-schedule from them. Always nulls + // highlightTimer — and the clearTimeout is what makes the chain's captured spans seek-safe + // (a seek cancels a pending fire before it can light a now-stale span). + function clearSchedule() { + if (highlightTimer !== null) { + clearTimeout(highlightTimer); + highlightTimer = null; + } + for (const span of sentences()) { + if (span.source) { + try { + span.source.stop(); + } catch { + // Already stopped/ended. + } + span.source = null; + } + span.startTime = undefined; + span.endTime = undefined; + span.classList.remove("active"); + } + } + + // Schedule one sentence to play right after the previously scheduled one, recording its + // timing and source on the span, then make sure the highlight chain is running. 
+ function scheduleOne(span, buffer) { + const source = audioCtx.createBufferSource(); + source.buffer = buffer; + source.connect(audioCtx.destination); + + if (nextStartTime === 0) { + nextStartTime = audioCtx.currentTime + 0.1; // small lead-in + } + // Never schedule in the past: a slow synth yields a gap, not an overlap. + nextStartTime = Math.max(nextStartTime, audioCtx.currentTime); + source.start(nextStartTime); + span.source = source; + span.startTime = nextStartTime; + span.endTime = nextStartTime + buffer.duration; + nextStartTime = span.endTime + SENTENCE_GAP_SECONDS; + + status.innerHTML = "Speaking..."; + ensureHighlight(); + } + + // Move the highlight to a span. + function setActive(span) { + const previous = highlightView.querySelector(".sentence.active"); + if (previous) { + previous.classList.remove("active"); + } + span.classList.add("active"); + span.scrollIntoView({ block: "nearest" }); + } + + // Arm the single timer for the next highlight transition, keyed to the audio clock. The + // span's scheduled start is in the future, so each delay is re-derived from the live + // audioCtx.currentTime — no drift accumulates and inter-sentence gaps are handled because + // we fire on the next sentence's start, keeping the previous one lit until then. + function armHighlight() { + const spans = sentences(); + const next = spans[highlightIndex]; + if (next && next.startTime !== undefined) { + highlightTimer = setTimeout( + () => { + setActive(next); + highlightIndex++; + armHighlight(); + }, + Math.max(0, (next.startTime - audioCtx.currentTime) * 1000), + ); + } else if (synthDone) { + // Everything is highlighted; revert to the editor after the last sentence ends. + const last = spans[spans.length - 1]; + highlightTimer = setTimeout( + finishPlayback, + Math.max(0, (last.endTime - audioCtx.currentTime) * 1000), + ); + } else { + // Next sentence isn't synthesized yet; park. scheduleOne() re-arms when it arrives. 
+ highlightTimer = null; + } + } + + // Start the highlight chain if it is idle. This `highlightTimer === null` gate is the ONLY + // place a chain is started (besides its own self-re-arm), preventing two concurrent chains. + function ensureHighlight() { + if (highlightTimer === null) { + armHighlight(); + } + } + + // Seek: (re)play starting from a given sentence, reusing the retained buffers. Does NOT + // bump synthGeneration, so any in-flight synthesis keeps running and its tail appends to + // this fresh schedule. Triggered by clicking a sentence. + function playFrom(index) { + clearSchedule(); + highlightIndex = index; + nextStartTime = 0; + const spans = sentences(); + for (let i = index; i < sentenceBuffers.length; i++) { + scheduleOne(spans[i], sentenceBuffers[i]); + } + } + + // Final cleanup when playback ends naturally: drop the highlight, return to the editor, + // and reset the UI to idle. + function finishPlayback() { + clearSchedule(); + showEditor(); + status.innerHTML = "Ready"; + buttonSpeak.innerHTML = "Speak"; + speaking = false; + } + + async function speak() { if (!voiceUrl) { alert("Voice model is not set"); return; } - if (!voiceConfigUrl) { + if (!loadedVoiceUrl) { alert("Voice config is not set"); return; } @@ -71,50 +266,127 @@ async function main() { return; } - if (voiceUrl != loadedVoiceUrl) { - status.innerHTML = "Loading voice..."; - await setVoice(voiceUrl, voiceConfigUrl); - loadedVoiceUrl = voiceUrl; - } + let speakerId = null; if (speakerSelect.selectedIndex > 0) { speakerId = parseInt(speakerSelect.value); } - let lengthScale = parseFloat(inputLengthScale.value); - if (isNaN(lengthScale)) { - lengthScale = null; - } + const lengthScale = parseScaleOrNull(inputLengthScale); + const noiseScale = parseScaleOrNull(inputNoiseScale); + const noiseWScale = parseScaleOrNull(inputNoiseWScale); - let noiseScale = parseFloat(inputNoiseScale.value); - if (isNaN(noiseScale)) { - noiseScale = null; - } + // Fresh run: abort any in-flight 
synthesis (synthGeneration), tear down playback, and + // reset the playhead, retained buffers, and view. clearSchedule does not touch + // highlightIndex, so reset it here. + const generation = ++synthGeneration; + clearSchedule(); + synthDone = false; + sentenceBuffers = []; + highlightIndex = 0; + nextStartTime = 0; - let noiseWScale = parseFloat(inputNoiseWScale.value); - if (isNaN(noiseWScale)) { - noiseWScale = null; + if (!audioCtx) { + audioCtx = new AudioContext(); } + await audioCtx.resume(); // requires a user gesture, which this click is + + const sampleRate = getSampleRate(); + + // Swap the editable textarea for the read-only highlight view, which fills in sentence + // by sentence as synthesis progresses. + resetHighlightView(); + showHighlightView(); + let viewCursor = 0; // Char offset already emitted into the view. status.innerHTML = "Synthesizing audio..."; - const wavAudio = await textToWavAudio( - text, - speakerId, - lengthScale, - noiseScale, - noiseWScale, - ); - const audioURL = URL.createObjectURL(wavAudio); + try { + for await (const { audio, start, end } of textToAudioSentences( + text, + speakerId, + lengthScale, + noiseScale, + noiseWScale, + )) { + // A newer Speak/Stop superseded us while we were synthesizing. (A seek does NOT + // bump synthGeneration, so this keeps going across seeks.) + if (generation !== synthGeneration) { + return; + } - audioTTS.src = audioURL; - audioTTS.play(); + // Append any text between the previous sentence and this one as plain text, then + // the sentence itself as a clickable span (click = seek here). Timing fields start + // undefined so the highlight loop never matches an unscheduled span. 
+ if (start > viewCursor) { + highlightView.appendChild( + document.createTextNode(text.slice(viewCursor, start)), + ); + } + const index = sentenceBuffers.length; + const span = document.createElement("span"); + span.className = "sentence"; + span.textContent = text.slice(start, end); + span.startTime = undefined; + span.endTime = undefined; + span.addEventListener("click", () => playFrom(index)); + highlightView.appendChild(span); + viewCursor = end; - status.innerHTML = "Ready"; + // Retain the decoded audio and schedule this one sentence onto the current timeline + // (streaming appends exactly one; a seek to an earlier sentence is handled by playFrom). + const buffer = audioCtx.createBuffer(1, audio.length, sampleRate); + buffer.copyToChannel(audio, 0); + sentenceBuffers.push(buffer); + scheduleOne(span, buffer); + } + } catch (e) { + status.innerHTML = "Error while synthesizing"; + clearSchedule(); + showEditor(); + throw e; + } + + // All sentences produced. Mark done, THEN kick the highlight chain: if synthesis briefly + // lagged playback the chain parked on the last sentence with synthDone still false, and + // this is what arms the finish timer so the view reverts. (Empty text scheduled nothing.) + synthDone = true; + if (sentenceBuffers.length === 0) { + finishPlayback(); + } else { + ensureHighlight(); + } + } + + + buttonSpeak.addEventListener("click", async () => { + if (!speaking) { + speaking = true; + buttonSpeak.innerHTML = "Stop"; + try { + // Stays "speaking" through playback; finishPlayback() resets the UI when the + // audio plays out. speak() resolves once synthesis is scheduled, not when audio ends. + await speak(); + } catch { + // speak() already restored the editor and set an error status. + speaking = false; + buttonSpeak.innerHTML = "Speak"; + } + } else { + // If the user clicks Stop while we're still speaking, stop immediately: abort synth + // (synthGeneration) and tear down playback (clearSchedule). 
+ synthGeneration++; + clearSchedule(); + showEditor(); + speaking = false; + status.innerHTML = "Ready"; + buttonSpeak.innerHTML = "Speak"; + } }); textInput.disabled = false; - buttonSpeak.disabled = false; + buttonSpeak.disabled = true; + status.innerHTML = "Load voice to begin"; fileModel.value = ""; fileConfig.value = ""; } @@ -138,19 +410,18 @@ function updateUIForConfig(voiceConfig) { } else { // Multi-speaker model const speakerIdMap = voiceConfig.speaker_id_map; - let sortedSpeakers = Object.keys(speakerIdMap).sort( + const sortedSpeakers = Object.keys(speakerIdMap).sort( (a, b) => speakerIdMap[a] - speakerIdMap[b], ); - for (let i in sortedSpeakers) { - let speaker = sortedSpeakers[i]; - let option = document.createElement("option"); - option.text = speaker + " (" + i.toString() + ")"; - option.value = i.toString(); + for (const speaker of sortedSpeakers) { + const id = speakerIdMap[speaker]; + const option = document.createElement("option"); + option.text = `${speaker} (${id})`; + option.value = String(id); speakerSelect.add(option); } - const selectSpeaker = document.getElementById("divSpeaker"); - divSpeaker.hidden = false; + document.getElementById("divSpeaker").hidden = false; } if (speakerSelect.options.length > 1) { diff --git a/resources/piper.js b/resources/piper.js index 82ebdb1..8225738 100644 --- a/resources/piper.js +++ b/resources/piper.js @@ -2,6 +2,14 @@ import EspeakModule from "./espeakng.worker.js"; +// Run onnxruntime inference in a Web Worker so it doesn't block the UI thread. +ort.env.wasm.proxy = true; + +// Use multiple threads for inference. This only takes effect when the page is +// cross-origin isolated (COOP + COEP headers -> SharedArrayBuffer available); +// otherwise onnxruntime-web silently falls back to a single thread. See serve.py. 
+ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4; + const AUDIO_OUTPUT_SYNCHRONOUS = 2; const espeakCHARS_AUTO = 0; @@ -26,7 +34,6 @@ const EOS = "$"; const PAD = "_"; let espeakInstance = null; -let espeakInitialized = false; let voiceModel = null; let voiceConfig = null; @@ -49,29 +56,90 @@ async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) { voiceModel = await ort.InferenceSession.create(voiceModelUrl); } +function getSampleRate() { + if (!voiceConfig) { + throw new Error("Voice is not set"); + } + return voiceConfig.audio.sample_rate; +} + +// Resolve scale arguments, falling back to the voice config defaults. +function resolveScales(lengthScale, noiseScale, noiseWScale) { + return { + lengthScale: lengthScale ?? voiceConfig.inference.length_scale ?? 1.0, + noiseScale: noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667, + noiseWScale: noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8, + }; +} + +// Run the ONNX model on a single utterance's phoneme ids, returning Float32 PCM. +async function synthesizeIds( + phonemeIds, + speakerId, + lengthScale, + noiseScale, + noiseWScale, +) { + const phonemeIdsTensor = new ort.Tensor( + "int64", + new BigInt64Array(phonemeIds.map((x) => BigInt(x))), + [1, phonemeIds.length], + ); + const phonemeLengthsTensor = new ort.Tensor( + "int64", + BigInt64Array.from([BigInt(phonemeIds.length)]), + [1], + ); + const scalesTensor = new ort.Tensor( + "float32", + Float32Array.from([noiseScale, lengthScale, noiseWScale]), + [3], + ); + + let feeds = { + input: phonemeIdsTensor, + input_lengths: phonemeLengthsTensor, + scales: scalesTensor, + }; + + if (voiceConfig.num_speakers > 1) { + // Multi-speaker + feeds["sid"] = new ort.Tensor( + "int64", + BigInt64Array.from([BigInt(speakerId ?? 0)]), + ); + } + + const results = await voiceModel.run(feeds); + return results.output.cpuData; +} + +// Currently unused by the demo (kept for the public API; the demo streams via +// textToAudioSentences instead). 
async function textToWavAudio( text, speakerId = undefined, - noiseScale = undefined, lengthScale = undefined, + noiseScale = undefined, noiseWScale = undefined, ) { if (!voiceConfig) { throw new Error("Voice is not set"); } - const sampleRate = voiceConfig.audio.sample_rate; const float32Audio = await textToFloat32Audio( text, speakerId, - noiseScale, lengthScale, + noiseScale, noiseWScale, ); - return float32ToWavBlob(float32Audio, sampleRate); + return float32ToWavBlob(float32Audio, getSampleRate()); } +// Currently unused by the demo (kept for the public API; the demo streams via +// textToAudioSentences instead). async function textToFloat32Audio( text, speakerId = undefined, @@ -83,62 +151,90 @@ async function textToFloat32Audio( throw new Error("Voice is not set"); } - lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0; - noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667; - noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8; + const scales = resolveScales(lengthScale, noiseScale, noiseWScale); - if (voiceConfig.num_speakers > 1) { - speakerId = speakerId ?? 
0; // first speaker - } - - const textPhonemes = textToPhonemes(text); + const textPhonemes = textToPhonemes(text).map((segment) => segment.phonemes); const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes); - // Run onnx model - const phonemeIdsTensor = new ort.Tensor( - "int64", - new BigInt64Array(phonemeIds.map((x) => BigInt(x))), - [1, phonemeIds.length], - ); - const phonemeLengthsTensor = new ort.Tensor( - "int64", - BigInt64Array.from([BigInt(phonemeIds.length)]), - [1], - ); - const scalesTensor = new ort.Tensor( - "float32", - Float32Array.from([noiseScale, lengthScale, noiseWScale]), - [3], + return synthesizeIds( + phonemeIds, + speakerId, + scales.lengthScale, + scales.noiseScale, + scales.noiseWScale, ); +} - let feeds = { - input: phonemeIdsTensor, - input_lengths: phonemeLengthsTensor, - scales: scalesTensor, - }; +// Synthesize a sentence at a time, yielding Float32 PCM for each as soon as it is +// ready. Lets the caller start playing early instead of waiting for the whole text. +async function* textToAudioSentences( + text, + speakerId = undefined, + lengthScale = undefined, + noiseScale = undefined, + noiseWScale = undefined, +) { + if (!voiceConfig) { + throw new Error("Voice is not set"); + } - if (voiceConfig.num_speakers > 1) { - // Multi-speaker - feeds["sid"] = new ort.Tensor( - "int64", - BigInt64Array.from([BigInt(speakerId)]), + const scales = resolveScales(lengthScale, noiseScale, noiseWScale); + + // textToPhonemes already segments into per-sentence { phonemes, start, end }. + const sentences = textToPhonemes(text); + + for (const sentence of sentences) { + const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, [sentence.phonemes]); + const audio = await synthesizeIds( + phonemeIds, + speakerId, + scales.lengthScale, + scales.noiseScale, + scales.noiseWScale, ); + // start/end are character indices into `text`, so the caller can highlight the slice + // this audio was synthesized from. 
+ yield { audio, start: sentence.start, end: sentence.end }; } +} - const results = await voiceModel.run(feeds); - const float32Audio = results.output.cpuData; +function utf8ByteLength(codePoint) { + if (codePoint <= 0x7f) return 1; + if (codePoint <= 0x7ff) return 2; + if (codePoint <= 0xffff) return 3; + return 4; +} - return float32Audio; +// espeak reports clause boundaries as UTF-8 byte offsets, but the displayed text is +// indexed in JS string units. Those offsets only ever move forward, so we translate them +// with a single forward-walking cursor (no lookup table): each call advances through the +// string until it reaches the requested byte offset and returns the character index there. +function makeByteToCharCursor(text) { + let byte = 0; + let char = 0; // JS string index == character index (surrogate pairs count as 2). + return (targetByte) => { + while (byte < targetByte && char < text.length) { + const codePoint = text.codePointAt(char); + byte += utf8ByteLength(codePoint); + // Advance one whole character: astral code points are a surrogate pair, so they + // occupy two UTF-16 string indices; everything in the BMP occupies one. + char += codePoint > 0xffff ? 2 : 1; + } + return char; + }; } +// Segment text into per-sentence units. Returns an array of +// { phonemes, start, end } where start/end are character indices into the original +// `text`, identifying the slice each sentence was synthesized from. function textToPhonemes(text) { if (!voiceConfig) { throw new Error("Voice is not set"); } if (voiceConfig.phoneme_type == "text") { - // Text phonemes - return [Array.from(text.normalize("NFD"))]; + // Text phonemes: the whole text is a single sentence. 
+ return [{ phonemes: Array.from(text.normalize("NFD")), start: 0, end: text.length }]; } if (!espeakInstance) { @@ -148,26 +244,16 @@ function textToPhonemes(text) { const voice = voiceConfig.espeak.voice; // Set voice - const voicePtr = espeakInstance._malloc( - espeakInstance.lengthBytesUTF8(voice) + 1, - ); - espeakInstance.stringToUTF8( - voice, - voicePtr, - espeakInstance.lengthBytesUTF8(voice) + 1, - ); + const voiceBytes = espeakInstance.lengthBytesUTF8(voice) + 1; + const voicePtr = espeakInstance._malloc(voiceBytes); + espeakInstance.stringToUTF8(voice, voicePtr, voiceBytes); espeakInstance._espeak_SetVoiceByName(voicePtr); espeakInstance._free(voicePtr); // Prepare text - const textPtr = espeakInstance._malloc( - espeakInstance.lengthBytesUTF8(text) + 1, - ); - espeakInstance.stringToUTF8( - text, - textPtr, - espeakInstance.lengthBytesUTF8(text) + 1, - ); + const textBytes = espeakInstance.lengthBytesUTF8(text) + 1; + const textPtr = espeakInstance._malloc(textBytes); + espeakInstance.stringToUTF8(text, textPtr, textBytes); const textPtrPtr = espeakInstance._malloc(4); espeakInstance.setValue(textPtrPtr, textPtr, "*"); @@ -175,13 +261,27 @@ function textToPhonemes(text) { // End of clause and sentences const terminatorPtr = espeakInstance._malloc(4); - // Phoneme lists for each sentence + // Translates espeak's UTF-8 byte offsets to character indices into the original `text` + // so they can slice/highlight it directly. + const toChar = makeByteToCharCursor(text); + + // Sentence segments, each { phonemes, start, end } in character indices. const textPhonemes = []; // Phoneme list for current sentence let sentencePhonemes = []; + // Character offsets: where the next clause begins, and where the current sentence + // (accumulation of clauses) began. + let cursorChar = 0; + let sentenceStartChar = 0; + while (true) { + // A new sentence is starting if we haven't accumulated any clauses for it yet. 
+ if (sentencePhonemes.length === 0) { + sentenceStartChar = cursorChar; + } + const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator( textPtrPtr, espeakCHARS_AUTO, @@ -209,13 +309,28 @@ function textToPhonemes(text) { sentencePhonemes.push("; "); } + // Where espeak will resume. 0 means the input is exhausted (this clause runs to the + // end of the text). Otherwise espeak reads one lookahead character past the clause + // boundary, so its resume offset overshoots the true boundary by exactly one + // character — subtract it back off (in character space) to land on the start of the + // next clause. + const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*"); + const endChar = + nextTextPtr === 0 + ? text.length + : Math.max(cursorChar, toChar(nextTextPtr - textPtr) - 1); + cursorChar = endChar; + if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) { // End of sentence - textPhonemes.push(sentencePhonemes); + textPhonemes.push({ + phonemes: sentencePhonemes, + start: sentenceStartChar, + end: endChar, + }); sentencePhonemes = []; } - const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*"); if (nextTextPtr === 0) { break; // All text processed } @@ -231,16 +346,20 @@ function textToPhonemes(text) { // Add lingering phonemes if (sentencePhonemes.length > 0) { - textPhonemes.push(sentencePhonemes); + textPhonemes.push({ + phonemes: sentencePhonemes, + start: sentenceStartChar, + end: text.length, + }); sentencePhonemes = []; } - // Prepare phonemes for Piper - for (let i = 0; i < textPhonemes.length; i++) { - textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD")); - } - - return textPhonemes; + // Prepare phonemes for Piper; start/end are already character indices into `text`. 
+ return textPhonemes.map((segment) => ({ + phonemes: Array.from(segment.phonemes.join("").normalize("NFD")), + start: segment.start, + end: segment.end, + })); } function phonemesToIds(idMap, textPhonemes) { @@ -300,4 +419,11 @@ function float32ToWavBlob(floatArray, sampleRate) { return new Blob([view], { type: "audio/wav" }); } -export { setVoice, textToWavAudio, textToFloat32Audio }; +export { + setVoice, + textToWavAudio, + textToFloat32Audio, + textToAudioSentences, + float32ToWavBlob, + getSampleRate, +}; diff --git a/serve.py b/serve.py new file mode 100644 index 0000000..b835ca5 --- /dev/null +++ b/serve.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +"""Static server for the Piper demo with cross-origin isolation enabled. + +onnxruntime-web can only run multi-threaded WASM (using SharedArrayBuffer) when +the page is "cross-origin isolated". That requires two response headers that the +stock `python -m http.server` does not send: + + Cross-Origin-Opener-Policy: same-origin + Cross-Origin-Embedder-Policy: credentialless + +We use `credentialless` rather than `require-corp`: both enable cross-origin +isolation (and thus SharedArrayBuffer / threads), but `credentialless` still lets +no-cors cross-origin assets load (e.g. the sponsor badge), instead of blocking +anything that doesn't send a CORP header. Supported in Firefox 119+ / Chrome 110+. + +Run this from the demo directory: + + python serve.py # serves on http://localhost:8080 + python serve.py 8080 # custom port + +After loading the page, confirm in the browser console: + + crossOriginIsolated === true + +If that is false, the headers are not reaching the browser and inference will +stay single-threaded. 
+""" + +import sys +from functools import partial +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class CrossOriginIsolatedHandler(SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Cross-Origin-Opener-Policy", "same-origin") + self.send_header("Cross-Origin-Embedder-Policy", "credentialless") + # Avoid stale assets while developing. + self.send_header("Cache-Control", "no-store") + super().end_headers() + + +def main(): + port = int(sys.argv[1]) if len(sys.argv) > 1 else 8080 + handler = partial(CrossOriginIsolatedHandler, directory=".") + with ThreadingHTTPServer(("0.0.0.0", port), handler) as httpd: + print(f"Serving cross-origin-isolated demo on http://localhost:{port}") + print("Confirm `crossOriginIsolated === true` in the browser console.") + try: + httpd.serve_forever() + except KeyboardInterrupt: + print("\nStopped.") + + +if __name__ == "__main__": + main()