This section covers converting between audio and text in real time, using Deepgram for speech-to-text and Aura text-to-speech, and ElevenLabs for streaming text-to-speech.
Real-time transcription via WebSocket:
```javascript
const WebSocket = require('ws');

class DeepgramSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
  }

  connect(onTranscript) {
    // mulaw at 8 kHz, mono: the format Twilio Media Streams deliver
    const url = 'wss://api.deepgram.com/v1/listen?' + new URLSearchParams({
      encoding: 'mulaw',
      sample_rate: 8000,
      channels: 1,
      punctuate: true,
      interim_results: true
    });

    this.ws = new WebSocket(url, {
      headers: { Authorization: `Token ${this.apiKey}` }
    });

    this.ws.on('message', (data) => {
      const response = JSON.parse(data);
      const transcript = response.channel?.alternatives?.[0];
      if (transcript?.transcript) {
        onTranscript({
          text: transcript.transcript,
          isFinal: response.is_final,
          confidence: transcript.confidence
        });
      }
    });
  }

  sendAudio(audioBuffer) {
    // Silently drop audio until the socket is open
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
    }
  }

  close() {
    this.ws?.close();
  }
}
```
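A minimal usage sketch, assuming a Twilio Media Streams WebSocket (the `twilioWs` name here is hypothetical) that delivers base64-encoded mu-law frames:

```javascript
// Sketch: feed a Twilio Media Stream into DeepgramSTT.
// `twilioWs` is a hypothetical WebSocket from a Twilio <Stream>.
const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);
stt.connect(({ text, isFinal }) => {
  if (isFinal) console.log('User said:', text);
});

twilioWs.on('message', (msg) => {
  const frame = JSON.parse(msg);
  if (frame.event === 'media') {
    // Twilio media frames carry base64-encoded mulaw audio
    stt.sendAudio(Buffer.from(frame.media.payload, 'base64'));
  }
});
```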
Deepgram's Aura TTS is a plain HTTP POST (this uses the global fetch available in Node 18+):
```javascript
class DeepgramTTS {
  constructor(apiKey) {
    this.apiKey = apiKey;
  }

  async synthesize(text) {
    // Request mulaw at 8 kHz so the audio can go to Twilio as-is
    const response = await fetch('https://api.deepgram.com/v1/speak?' + new URLSearchParams({
      model: 'aura-asteria-en',
      encoding: 'mulaw',
      sample_rate: 8000
    }), {
      method: 'POST',
      headers: {
        'Authorization': `Token ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ text })
    });
    return Buffer.from(await response.arrayBuffer());
  }
}
```
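To play the result on a call, send the buffer back over the Twilio Media Stream as a base64 media message (a sketch; `twilioWs` and `streamSid` are assumed to come from your stream handler):

```javascript
// Sketch: play synthesized audio back into the call.
const tts = new DeepgramTTS(process.env.DEEPGRAM_API_KEY);
const audio = await tts.synthesize('Hello! How can I help you today?');

twilioWs.send(JSON.stringify({
  event: 'media',
  streamSid,
  media: { payload: audio.toString('base64') }
}));
```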
Streaming TTS with ElevenLabs:
```javascript
class ElevenLabsTTS {
  constructor(apiKey, voiceId) {
    this.apiKey = apiKey;
    this.voiceId = voiceId;
  }

  // Async generator: yields audio chunks as ElevenLabs streams them back
  async* streamSpeech(text) {
    const response = await fetch(
      // output_format is a query parameter, not a body field
      `https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}/stream?output_format=ulaw_8000`,
      {
        method: 'POST',
        headers: {
          'xi-api-key': this.apiKey,
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({
          text: text,
          model_id: 'eleven_monolingual_v1'
        })
      }
    );

    const reader = response.body.getReader();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      yield value;
    }
  }
}
```
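Consuming the generator looks like this (same hypothetical `twilioWs` and `streamSid` as above):

```javascript
// Sketch: forward ElevenLabs audio to the call chunk by chunk.
const eleven = new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY, 'YOUR_VOICE_ID');

for await (const chunk of eleven.streamSpeech('One moment while I check that.')) {
  twilioWs.send(JSON.stringify({
    event: 'media',
    streamSid,
    media: { payload: Buffer.from(chunk).toString('base64') }
  }));
}
```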
Twilio Media Streams carry mu-law audio at 8 kHz. If your audio source produces linear PCM, convert it first:
```javascript
// Convert one 16-bit signed linear PCM sample to an 8-bit mu-law byte.
// This variant works on a 14-bit magnitude (values above 0x1FFF are clipped),
// so shift full-range 16-bit samples right by 2 before calling it.
function linearToMulaw(sample) {
  const MULAW_MAX = 0x1FFF;
  const MULAW_BIAS = 33;
  let sign = (sample >> 8) & 0x80;
  if (sign) sample = -sample;
  sample += MULAW_BIAS;
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  // Exponent = position of the leading 1 bit, relative to bit 5
  let exponent = Math.floor(Math.log2(sample)) - 5;
  if (exponent < 0) exponent = 0;
  // Mantissa = the 4 bits just below the leading 1
  let mantissa = (sample >> (exponent + 1)) & 0x0F;
  let mulaw = ~(sign | (exponent << 4) | mantissa);
  return mulaw & 0xFF;
}
```
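A small helper (a sketch) applies the conversion across a whole buffer of 16-bit little-endian PCM:

```javascript
// Sketch: convert a buffer of 16-bit LE PCM samples to mu-law bytes.
function pcmBufferToMulaw(pcmBuffer) {
  const out = Buffer.alloc(pcmBuffer.length / 2);
  for (let i = 0; i < out.length; i++) {
    const sample = pcmBuffer.readInt16LE(i * 2);
    out[i] = linearToMulaw(sample >> 2); // scale 16-bit down to 14-bit
  }
  return out;
}
```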
Use interim results for faster response:
```javascript
class TranscriptionHandler {
  constructor() {
    this.lastInterim = '';
  }

  onTranscript({ text, isFinal }) {
    if (isFinal) {
      // Final result: hand it off for processing
      this.processText(text);
      this.lastInterim = '';
    } else {
      // Interim result: useful for interruption (barge-in) detection.
      // If the transcript grew noticeably, the user is still speaking.
      if (text.length > this.lastInterim.length + 10) {
        this.onUserSpeaking();
      }
      this.lastInterim = text;
    }
  }

  // processText() and onUserSpeaking() are application-defined hooks.
}
```
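One way to use the speaking signal (a sketch; `tts.stop()` and `tts.isSpeaking` are hypothetical members of your playback wrapper) is to cut off TTS when the caller interrupts:

```javascript
// Sketch: barge-in handling. Stop playback when the caller talks over it.
const handler = new TranscriptionHandler();
handler.onUserSpeaking = () => {
  if (tts.isSpeaking) tts.stop(); // hypothetical playback controls
};
handler.processText = (text) => {
  console.log('Final transcript:', text);
};
```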
Deepgram's Aura TTS models include:

| Voice | Description |
|---|---|
| aura-asteria-en | Female, American |
| aura-luna-en | Female, American |
| aura-stella-en | Female, American |
| aura-orion-en | Male, American |
| aura-arcas-en | Male, American |
Get voice IDs from the ElevenLabs Voice Library or API:
```bash
curl -X GET "https://api.elevenlabs.io/v1/voices" \
  -H "xi-api-key: YOUR_API_KEY"
```
To cut perceived latency, start speaking before the LLM finishes: buffer streamed tokens and hand each complete sentence to TTS as soon as it appears (here `llmStream` and `tts.speak` stand in for your LLM token stream and TTS wrapper):

```javascript
// Send each complete sentence to TTS as soon as we have one
let buffer = '';
for await (const token of llmStream) {
  buffer += token;
  // Check for a sentence boundary
  const match = buffer.match(/^(.+?[.!?])\s*/);
  if (match) {
    tts.speak(match[1]);
    buffer = buffer.slice(match[0].length);
  }
}
// Don't forget the remaining text
if (buffer.trim()) {
  tts.speak(buffer);
}
```
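Sentence-level chunking is a latency tradeoff: smaller chunks start audio sooner but can sound choppy, while larger chunks sound more natural but delay the first word. Note that a simple end-of-sentence regex will also fire on abbreviations like "Dr.", which is usually acceptable in speech but worth knowing about.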
Connections drop mid-call, so wrap the STT WebSocket with reconnection logic:

```javascript
class ResilientSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.reconnectAttempts = 0;
  }

  connect(onTranscript) {
    // ... connection code ...
    this.ws.on('open', () => {
      // A successful connection resets the retry budget
      this.reconnectAttempts = 0;
    });
    this.ws.on('error', (error) => {
      // 'close' fires after 'error', so reconnection is handled there
      console.error('STT error:', error);
    });
    this.ws.on('close', () => {
      this.reconnect(onTranscript);
    });
  }

  reconnect(onTranscript) {
    // Give up after five consecutive failed attempts
    if (this.reconnectAttempts < 5) {
      this.reconnectAttempts++;
      setTimeout(() => this.connect(onTranscript), 1000);
    }
  }
}
```