From 70c6f07ef7ea05709a110fdc7710e44d413e4f5e Mon Sep 17 00:00:00 2001 From: Ivan Date: Mon, 8 Jun 2026 22:35:59 +0200 Subject: [PATCH 1/8] Add configuration to ort for better performance --- README.md | 6 ++++- resources/piper.js | 8 +++++++ serve.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 serve.py diff --git a/README.md b/README.md index e2f09126..487f5ecb 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ # Piper Samples -Samples for [Piper](https://github.com/rhasspy/piper) text to speech system. + +Samples for [Piper](https://github.com/OHF-Voice/piper1-gpl) text to speech system. + +## Demo +To run the demo, use `python serve.py`; it sends the COOP and COEP headers that ONNX Runtime needs to run with multiple threads. \ No newline at end of file diff --git a/resources/piper.js b/resources/piper.js index 82ebdb1a..0521ecc6 100644 --- a/resources/piper.js +++ b/resources/piper.js @@ -2,6 +2,14 @@ import EspeakModule from "./espeakng.worker.js"; +// Run onnxruntime inference in a Web Worker so it doesn't block the UI thread. +ort.env.wasm.proxy = true; + +// Use multiple threads for inference. This only takes effect when the page is +// cross-origin isolated (COOP + COEP headers -> SharedArrayBuffer available); +// otherwise onnxruntime-web silently falls back to a single thread. See serve.py. +ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4; + const AUDIO_OUTPUT_SYNCHRONOUS = 2; const espeakCHARS_AUTO = 0; diff --git a/serve.py b/serve.py new file mode 100644 index 00000000..b835ca54 --- /dev/null +++ b/serve.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +"""Static server for the Piper demo with cross-origin isolation enabled. + +onnxruntime-web can only run multi-threaded WASM (using SharedArrayBuffer) when +the page is "cross-origin isolated". 
That requires two response headers that the +stock `python -m http.server` does not send: + + Cross-Origin-Opener-Policy: same-origin + Cross-Origin-Embedder-Policy: credentialless + +We use `credentialless` rather than `require-corp`: both enable cross-origin +isolation (and thus SharedArrayBuffer / threads), but `credentialless` still lets +no-cors cross-origin assets load (e.g. the sponsor badge), instead of blocking +anything that doesn't send a CORP header. Supported in Firefox 119+ / Chrome 110+. + +Run this from the demo directory: + + python serve.py # serves on http://localhost:8080 + python serve.py 9000 # custom port + +After loading the page, confirm in the browser console: + + crossOriginIsolated === true + +If that is false, the headers are not reaching the browser and inference will +stay single-threaded. +""" + +import sys +from functools import partial +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class CrossOriginIsolatedHandler(SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Cross-Origin-Opener-Policy", "same-origin") + self.send_header("Cross-Origin-Embedder-Policy", "credentialless") + # Avoid stale assets while developing. 
+ self.send_header("Cache-Control", "no-store") + super().end_headers() + + +def main(): + port = int(sys.argv[1]) if len(sys.argv) > 1 else 8080 + handler = partial(CrossOriginIsolatedHandler, directory=".") + with ThreadingHTTPServer(("0.0.0.0", port), handler) as httpd: + print(f"Serving cross-origin-isolated demo on http://localhost:{port}") + print("Confirm `crossOriginIsolated === true` in the browser console.") + try: + httpd.serve_forever() + except KeyboardInterrupt: + print("\nStopped.") + + +if __name__ == "__main__": + main() From 61761ceaa81926ec168cfa8cba6537c9a366e60b Mon Sep 17 00:00:00 2001 From: Ivan Date: Tue, 9 Jun 2026 08:08:38 +0200 Subject: [PATCH 2/8] working per sequence --- resources/demo.js | 119 +++++++++++++++++++++++++++++++++---- resources/piper.js | 144 ++++++++++++++++++++++++++++++++------------- 2 files changed, 210 insertions(+), 53 deletions(-) diff --git a/resources/demo.js b/resources/demo.js index 30d53106..237fe3e5 100644 --- a/resources/demo.js +++ b/resources/demo.js @@ -1,9 +1,56 @@ -import { setVoice, textToWavAudio } from "./piper.js"; +import { + setVoice, + textToAudioSentences, + float32ToWavBlob, + getSampleRate, +} from "./piper.js"; let voiceUrl = ""; let loadedVoiceUrl = ""; let voiceConfigUrl = ""; +// Silence inserted between sentences, both for live playback scheduling and in the +// assembled WAV so replay matches the stream. Tune to taste. +const SENTENCE_GAP_SECONDS = 0.2; + +// Web Audio playback state (created lazily on first user gesture, reused after). +let audioCtx = null; +// Bumped on each Speak click so an in-flight stream knows to abort. +let playbackGeneration = 0; +// Source nodes scheduled for the current playback, so we can stop them on re-click. +let activeSources = []; + +function stopPlayback() { + for (const src of activeSources) { + try { + src.stop(); + } catch { + // Already stopped/ended. 
+ } + } + activeSources = []; +} + +// Concatenate per-sentence Float32 chunks with SENTENCE_GAP_SECONDS of silence +// between them, matching what was played live. +function joinWithGaps(chunks, sampleRate) { + const gapSamples = Math.round(SENTENCE_GAP_SECONDS * sampleRate); + const total = + chunks.reduce((n, c) => n + c.length, 0) + + gapSamples * Math.max(0, chunks.length - 1); + + const out = new Float32Array(total); + let offset = 0; + chunks.forEach((chunk, i) => { + out.set(chunk, offset); + offset += chunk.length; + if (i < chunks.length - 1) { + offset += gapSamples; // leave zeros (silence) + } + }); + return out; +} + async function main() { const fileModel = document.getElementById("fileModel"); const fileConfig = document.getElementById("fileConfig"); @@ -97,18 +144,68 @@ async function main() { noiseWScale = null; } + // Stop any in-progress playback and mark this as the current generation. + const generation = ++playbackGeneration; + stopPlayback(); + + if (!audioCtx) { + audioCtx = new AudioContext(); + } + await audioCtx.resume(); // requires a user gesture, which this click is + + const sampleRate = getSampleRate(); + const chunks = []; + let nextStartTime = 0; + status.innerHTML = "Synthesizing audio..."; - const wavAudio = await textToWavAudio( - text, - speakerId, - lengthScale, - noiseScale, - noiseWScale, - ); - const audioURL = URL.createObjectURL(wavAudio); + try { + for await (const audio of textToAudioSentences( + text, + speakerId, + lengthScale, + noiseScale, + noiseWScale, + )) { + // A newer click superseded us while we were synthesizing. + if (generation !== playbackGeneration) { + return; + } + + chunks.push(audio); + + // Schedule this sentence to play right after the previous one. 
+ const buffer = audioCtx.createBuffer(1, audio.length, sampleRate); + buffer.copyToChannel(audio, 0); + const source = audioCtx.createBufferSource(); + source.buffer = buffer; + source.connect(audioCtx.destination); + + if (nextStartTime === 0) { + nextStartTime = audioCtx.currentTime + 0.1; // small lead-in + status.innerHTML = "Playing..."; + } + // Never schedule in the past: a slow synth yields a gap, not an overlap. + nextStartTime = Math.max(nextStartTime, audioCtx.currentTime); + source.start(nextStartTime); + nextStartTime += buffer.duration + SENTENCE_GAP_SECONDS; - audioTTS.src = audioURL; - audioTTS.play(); + activeSources.push(source); + } + } catch (e) { + status.innerHTML = "Error"; + throw e; + } + + if (generation !== playbackGeneration) { + return; + } + + // Hybrid: assemble the full WAV so the