Web Speech API Implementation (Speech Recognition and Synthesis)
Web Speech API consists of two independent parts: SpeechRecognition (speech → text) and SpeechSynthesis (text → speech). The first is needed for voice control, dictation, voice search. The second is for content voicing, notifications, accessibility.
Browser Support
SpeechRecognition: Chrome/Edge (with webkit prefix), Android Chrome, and Safari 14.1+ (also webkit-prefixed, with platform limitations). Firefox — not supported. For production, fallback to server-side ASR (Whisper/Deepgram) is needed.
SpeechSynthesis: all modern browsers, including Safari iOS.
Speech Recognition
// Resolve the recognition constructor across implementations: the standard
// `SpeechRecognition` or Chrome/Edge's prefixed `webkitSpeechRecognition`.
// NOTE: this value intentionally shadows the built-in `SpeechRecognition`
// *type* name in this module.
const SpeechRecognition =
window.SpeechRecognition || (window as any).webkitSpeechRecognition
/** Configuration for the `useSpeechRecognition` hook. */
interface UseSpeechRecognitionOptions {
  /** BCP-47 language tag for recognition (e.g. 'en-US'). */
  lang?: string
  /** Continuous recording across pauses vs stopping after a single phrase. */
  continuous?: boolean
  /** Deliver intermediate (non-final) transcripts in real time. */
  interimResults?: boolean
  /** Called with each transcript chunk; `isFinal` marks final vs interim results. */
  onResult: (transcript: string, isFinal: boolean) => void
  /** Called with a human-readable message when recognition fails or is unsupported. */
  onError?: (error: string) => void
}
function useSpeechRecognition({
lang = 'en-US',
continuous = false,
interimResults = true,
onResult,
onError,
}: UseSpeechRecognitionOptions) {
const recognitionRef = useRef<SpeechRecognition | null>(null)
const [isListening, setIsListening] = useState(false)
const [isSupported] = useState(() => 'SpeechRecognition' in window || 'webkitSpeechRecognition' in window)
function start() {
if (!isSupported) {
onError?.('Browser does not support speech recognition')
return
}
const recognition = new SpeechRecognition()
recognition.lang = lang
recognition.continuous = continuous
recognition.interimResults = interimResults
recognition.maxAlternatives = 1
recognition.onstart = () => setIsListening(true)
recognition.onend = () => setIsListening(false)
recognition.onresult = (event: SpeechRecognitionEvent) => {
let finalTranscript = ''
let interimTranscript = ''
for (let i = event.resultIndex; i < event.results.length; i++) {
const transcript = event.results[i][0].transcript
if (event.results[i].isFinal) {
finalTranscript += transcript
} else {
interimTranscript += transcript
}
}
if (finalTranscript) {
onResult(finalTranscript.trim(), true)
} else if (interimTranscript) {
onResult(interimTranscript.trim(), false)
}
}
recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
const messages: Record<string, string> = {
'not-allowed': 'Microphone access denied',
'no-speech': 'Speech not detected',
'network': 'Network error during recognition',
'audio-capture': 'Microphone unavailable',
}
onError?.(messages[event.error] ?? event.error)
setIsListening(false)
}
recognitionRef.current = recognition
recognition.start()
}
function stop() {
recognitionRef.current?.stop()
recognitionRef.current = null
}
return { isListening, isSupported, start, stop }
}
Voice Dictation Component
/**
 * Continuous voice-dictation widget. Final phrases accumulate into
 * `transcript` (reported to the parent via `onChange`); the current interim
 * phrase is previewed in gray italics.
 */
function VoiceDictation({ onChange }: { onChange: (text: string) => void }) {
  const [transcript, setTranscript] = useState('')
  const [interim, setInterim] = useState('')
  // FIX: mirror of `transcript` so the recognition callback can append
  // without calling onChange inside a setState updater — updaters must be
  // pure (they may run twice under StrictMode), so the old code could fire
  // the onChange side effect twice per phrase.
  const transcriptRef = useRef('')

  const { isListening, isSupported, start, stop } = useSpeechRecognition({
    lang: 'en-US',
    continuous: true,
    interimResults: true,
    onResult: (text, isFinal) => {
      if (isFinal) {
        const prev = transcriptRef.current
        const next = prev + (prev ? ' ' : '') + text
        transcriptRef.current = next
        setTranscript(next)
        setInterim('')
        onChange(next)
      } else {
        setInterim(text)
      }
    },
    onError: (err) => console.warn('Speech error:', err),
  })

  if (!isSupported) {
    return <p className="text-sm text-gray-500">Voice input not available in this browser</p>
  }

  return (
    <div className="border rounded-lg p-3">
      <div className="min-h-[80px] text-sm">
        <span>{transcript}</span>
        {interim && <span className="text-gray-400 italic"> {interim}</span>}
      </div>
      <div className="flex gap-2 mt-2 border-t pt-2">
        <button
          onClick={isListening ? stop : start}
          className={`flex items-center gap-2 px-3 py-1.5 rounded text-sm ${
            isListening
              ? 'bg-red-100 text-red-700'
              : 'bg-blue-100 text-blue-700'
          }`}
        >
          {isListening ? (
            <>
              <span className="w-2 h-2 bg-red-500 rounded-full animate-pulse" />
              Stop
            </>
          ) : (
            'Speak'
          )}
        </button>
        <button
          onClick={() => {
            transcriptRef.current = ''
            setTranscript('')
            setInterim('')
            onChange('')
          }}
          className="text-sm text-gray-500 hover:text-gray-700"
        >
          Clear
        </button>
      </div>
    </div>
  )
}
Voice Commands
/**
 * Map spoken phrases to actions. Listens continuously for final results only
 * and fires the first command whose phrase occurs in the transcript.
 */
function useVoiceCommands(commands: Record<string, () => void>) {
  const { start, stop, isListening } = useSpeechRecognition({
    lang: 'en-US',
    continuous: true,
    interimResults: false,
    onResult: (transcript) => {
      const spoken = transcript.toLowerCase().trim()
      // First matching entry wins (insertion order of `commands`).
      const match = Object.entries(commands).find(([phrase]) => spoken.includes(phrase))
      match?.[1]()
    },
  })
  return { start, stop, isListening }
}
// Usage: wire spoken phrases to slide-navigation actions.
// `goToSlide` and `current` come from the surrounding component (not shown).
const { start } = useVoiceCommands({
  'next slide': () => goToSlide(current + 1),
  'previous slide': () => goToSlide(current - 1),
  'first slide': () => goToSlide(0),
  // requestFullscreen returns a Promise; rejection is ignored here.
  'fullscreen': () => document.documentElement.requestFullscreen(),
})
Text-to-Speech
/**
 * Thin wrapper around window.speechSynthesis for text-to-speech playback.
 * At most one utterance plays at a time: speak() cancels anything pending.
 */
class TextToSpeech {
  private synth = window.speechSynthesis
  private currentUtterance: SpeechSynthesisUtterance | null = null
  // FIX: Chrome populates voices asynchronously — getVoices() returns []
  // until the 'voiceschanged' event fires, which made voiceName lookups
  // silently fail early on. Keep a cache refreshed by that event.
  private voices: SpeechSynthesisVoice[] = []

  constructor() {
    this.voices = this.synth.getVoices()
    this.synth.onvoiceschanged = () => {
      this.voices = this.synth.getVoices()
    }
  }

  /**
   * Speak `text`, replacing any utterance currently playing or queued.
   *
   * @param options.rate   0.1–10, default 1
   * @param options.pitch  0–2, default 1
   * @param options.volume 0–1, default 1
   * @param options.voiceName exact SpeechSynthesisVoice.name to use, if available
   * @param options.onEnd  invoked when playback finishes naturally
   */
  speak(text: string, options: {
    lang?: string
    rate?: number
    pitch?: number
    volume?: number
    voiceName?: string
    onEnd?: () => void
  } = {}) {
    this.stop()
    const utterance = new SpeechSynthesisUtterance(text)
    utterance.lang = options.lang ?? 'en-US'
    utterance.rate = options.rate ?? 1
    utterance.pitch = options.pitch ?? 1
    utterance.volume = options.volume ?? 1

    if (options.voiceName) {
      // Refresh the cache once more in case 'voiceschanged' hasn't fired yet.
      if (this.voices.length === 0) this.voices = this.synth.getVoices()
      const voice = this.voices.find((v) => v.name === options.voiceName)
      if (voice) utterance.voice = voice
    }

    // FIX: always clear the stale utterance reference when playback ends,
    // not only when the caller supplied onEnd.
    utterance.onend = () => {
      this.currentUtterance = null
      options.onEnd?.()
    }

    // Chrome workaround: long text gets cut off around 15 seconds.
    // Periodically "wake" the synthesizer at sentence boundaries.
    utterance.onboundary = (event) => {
      if (event.name === 'sentence') {
        this.synth.pause()
        this.synth.resume()
      }
    }

    this.currentUtterance = utterance
    this.synth.speak(utterance)
  }

  /** Cancel the current utterance and flush the queue. */
  stop() {
    this.synth.cancel()
    this.currentUtterance = null
  }

  pause() { this.synth.pause() }
  resume() { this.synth.resume() }

  /**
   * Installed voices filtered by language prefix.
   * @param langPrefix BCP-47 prefix to match, default 'en' (backward compatible).
   */
  getVoices(langPrefix: string = 'en'): SpeechSynthesisVoice[] {
    if (this.voices.length === 0) this.voices = this.synth.getVoices()
    return this.voices.filter((v) => v.lang.startsWith(langPrefix))
  }
}
Fallback: Whisper API for Serious ASR
When browser ASR is insufficient (low quality, Firefox/Safari support):
/**
 * Transcribe recorded audio with the OpenAI Whisper API.
 *
 * SECURITY: this must run server-side (e.g. behind /api/transcribe) — the
 * API key must never ship to the browser.
 *
 * @param audioBlob recorded audio (e.g. audio/webm from MediaRecorder)
 * @returns the recognized text
 * @throws Error when the API responds with a non-2xx status
 */
async function transcribeWithWhisper(audioBlob: Blob): Promise<string> {
  const formData = new FormData()
  formData.append('file', audioBlob, 'audio.webm')
  formData.append('model', 'whisper-1')
  formData.append('language', 'en')

  const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
    method: 'POST',
    headers: { Authorization: `Bearer ${process.env.OPENAI_API_KEY}` },
    body: formData,
  })
  // FIX: the old code ignored HTTP errors — a 401/429 would silently yield
  // `undefined` instead of surfacing the failure to the caller.
  if (!response.ok) {
    throw new Error(`Whisper transcription failed: ${response.status} ${await response.text()}`)
  }
  const data = (await response.json()) as { text: string }
  return data.text
}
Record via MediaRecorder API, send to /api/transcribe which proxies to Whisper — keeps key off frontend.
What We Do
Determine scenario: voice search, dictation, control commands, TTS for accessibility. Implement corresponding API part, add fallback (Whisper for ASR, browser TTS works everywhere). Test on various browsers, account for autoplay policy.
Timeline: voice search or dictation — 1–2 days. Voice commands + TTS — 2–3 days. With Whisper fallback — plus 1 day.







