Speech Services

Speech-to-Text and Text-to-Speech with Deepgram and ElevenLabs

Converting between audio and text in real-time using Deepgram and ElevenLabs.

Deepgram STT (Speech-to-Text)

Real-time transcription via WebSocket:

const WebSocket = require('ws');

class DeepgramSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
  }
  
  connect(onTranscript) {
    // Twilio media streams are mulaw-encoded at 8 kHz, mono
    const url = 'wss://api.deepgram.com/v1/listen?' + new URLSearchParams({
      encoding: 'mulaw',
      sample_rate: 8000,
      channels: 1,
      punctuate: true,
      interim_results: true // emit partial transcripts while the caller is speaking
    });
    
    this.ws = new WebSocket(url, {
      headers: { Authorization: `Token ${this.apiKey}` }
    });
    
    this.ws.on('message', (data) => {
      const response = JSON.parse(data);
      // Non-transcript messages (e.g. metadata) won't have this shape
      const transcript = response.channel?.alternatives?.[0];
      
      if (transcript?.transcript) {
        onTranscript({
          text: transcript.transcript,
          isFinal: response.is_final,
          confidence: transcript.confidence
        });
      }
    });
  }
  
  sendAudio(audioBuffer) {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
    }
  }
  
  close() {
    this.ws?.close();
  }
}
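
A minimal usage sketch, assuming audio arrives as Twilio media-stream messages over a WebSocket (twilioWs and the surrounding wiring are hypothetical):

const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);

stt.connect(({ text, isFinal, confidence }) => {
  if (isFinal) {
    console.log(`Final (${confidence.toFixed(2)}): ${text}`);
  }
});

// Twilio sends base64-encoded mulaw payloads in 'media' events
twilioWs.on('message', (raw) => {
  const msg = JSON.parse(raw);
  if (msg.event === 'media') {
    stt.sendAudio(Buffer.from(msg.media.payload, 'base64'));
  }
});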

Deepgram TTS (Text-to-Speech)

Single-shot synthesis over HTTP (uses the global fetch available in Node 18+):

class DeepgramTTS {
  constructor(apiKey) {
    this.apiKey = apiKey;
  }
  
  async synthesize(text) {
    const response = await fetch('https://api.deepgram.com/v1/speak?' + new URLSearchParams({
      model: 'aura-asteria-en',
      encoding: 'mulaw',
      sample_rate: 8000
    }), {
      method: 'POST',
      headers: {
        'Authorization': `Token ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ text })
    });
    
    if (!response.ok) {
      throw new Error(`Deepgram TTS request failed: ${response.status}`);
    }
    
    // Raw mulaw/8000 audio bytes
    return Buffer.from(await response.arrayBuffer());
  }
}
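
A usage sketch; sendToCaller is a placeholder for whatever transport carries audio back to the caller:

const tts = new DeepgramTTS(process.env.DEEPGRAM_API_KEY);

// A Buffer of mulaw/8000 bytes, ready for a Twilio media stream
const audio = await tts.synthesize('Thanks for calling. How can I help?');
sendToCaller(audio);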

ElevenLabs TTS

Streaming TTS with ElevenLabs:

class ElevenLabsTTS {
  constructor(apiKey, voiceId) {
    this.apiKey = apiKey;
    this.voiceId = voiceId;
  }
  
  async* streamSpeech(text) {
    // output_format is a query parameter, not a body field
    const response = await fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}/stream?output_format=ulaw_8000`,
      {
        method: 'POST',
        headers: {
          'xi-api-key': this.apiKey,
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({
          text: text,
          model_id: 'eleven_monolingual_v1'
        })
      }
    );
    
    if (!response.ok) {
      throw new Error(`ElevenLabs TTS request failed: ${response.status}`);
    }
    
    const reader = response.body.getReader();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      yield value;
    }
  }
}
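
Consuming the stream chunk by chunk keeps time-to-first-audio low. A sketch, again with sendToCaller as a placeholder and voiceId taken from the Voice Selection section below:

const tts = new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY, voiceId);

for await (const chunk of tts.streamSpeech('One moment while I look that up.')) {
  // Each chunk is a Uint8Array of ulaw_8000 audio; forward it immediately
  sendToCaller(Buffer.from(chunk));
}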

Audio Format Conversion

Twilio media streams carry 8 kHz mulaw audio. If your source produces 16-bit linear PCM, convert each sample:

// Encode one signed 16-bit PCM sample as an 8-bit mulaw byte
function linearToMulaw(sample) {
  const MULAW_MAX = 0x1FFF;
  const MULAW_BIAS = 33;
  
  let sign = (sample >> 8) & 0x80;
  if (sign) sample = -sample;
  sample += MULAW_BIAS;
  
  // Loud samples clip at the top of the mulaw range
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  
  // The position of the highest set bit determines the segment (exponent)
  let exponent = Math.floor(Math.log2(sample)) - 5;
  if (exponent < 0) exponent = 0;
  
  // The mantissa is the four bits below the leading bit
  let mantissa = (sample >> (exponent + 1)) & 0x0F;
  let mulaw = ~(sign | (exponent << 4) | mantissa);
  
  return mulaw & 0xFF;
}
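
To convert a whole frame, apply the encoder per sample. A sketch assuming 16-bit little-endian PCM input:

function pcmToMulawBuffer(pcm) {
  // One mulaw byte per 16-bit PCM sample
  const out = Buffer.alloc(pcm.length / 2);
  for (let i = 0; i < out.length; i++) {
    out[i] = linearToMulaw(pcm.readInt16LE(i * 2));
  }
  return out;
}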

Handling Interim Results

Interim results arrive before the final transcript; use them to respond faster and to detect that the caller is still speaking:

class TranscriptionHandler {
  constructor() {
    this.lastInterim = '';
  }
  
  onTranscript({ text, isFinal }) {
    if (isFinal) {
      // Final result - process it
      this.processText(text);
      this.lastInterim = '';
    } else {
      // Interim result - could use for interruption detection
      if (text.length > this.lastInterim.length + 10) {
        // User is still speaking
        this.onUserSpeaking();
      }
      this.lastInterim = text;
    }
  }
  
  // Application-specific hooks
  processText(text) { /* e.g. send the utterance to the LLM */ }
  onUserSpeaking() { /* e.g. stop TTS playback (barge-in) */ }
}
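
Wired to the DeepgramSTT class from above:

const handler = new TranscriptionHandler();
const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);

// Every Deepgram result (interim and final) flows through the handler
stt.connect((result) => handler.onTranscript(result));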

Voice Selection

Deepgram Voices

Voice            Description
aura-asteria-en  Female, American
aura-luna-en     Female, American
aura-stella-en   Female, American
aura-orion-en    Male, American
aura-arcas-en    Male, American

ElevenLabs

Get voice IDs from the ElevenLabs Voice Library or API:

curl -X GET "https://api.elevenlabs.io/v1/voices" \
  -H "xi-api-key: YOUR_API_KEY"

Latency Optimization

  1. Use streaming - Don't wait for complete responses
  2. Chunk text - Send to TTS in small pieces
  3. Pre-buffer - Start TTS before LLM finishes
  4. Connection pooling - Reuse WebSocket connections (see the reuse sketch after the chunking example)

Sentence-level chunking (points 1-3) in practice:

// Send to TTS as soon as we have a sentence
let buffer = '';

for await (const token of llmStream) {
  buffer += token;
  
  // Check for sentence boundary
  const match = buffer.match(/^(.+?[.!?])\s*/);
  if (match) {
    tts.speak(match[1]);
    buffer = buffer.slice(match[0].length);
  }
}

// Don't forget remaining text
if (buffer.trim()) {
  tts.speak(buffer);
}
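
For point 4, a minimal reuse sketch built on the DeepgramSTT class from above; callId and onTranscript are hypothetical wiring:

// Keep one STT socket open per call instead of reconnecting every turn
const sttByCall = new Map();

function getSTT(callId, onTranscript) {
  if (!sttByCall.has(callId)) {
    const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);
    stt.connect(onTranscript);
    sttByCall.set(callId, stt);
  }
  return sttByCall.get(callId);
}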

Error Handling

Reconnect automatically when the STT socket errors or closes unexpectedly:

class ResilientSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.reconnectAttempts = 0;
    this.closedByUser = false;
  }
  
  connect(onTranscript) {
    // ... connection code ...
    
    this.ws.on('open', () => {
      // Reset the backoff once a connection succeeds
      this.reconnectAttempts = 0;
    });
    
    this.ws.on('error', (error) => {
      console.error('STT error:', error);
      this.reconnect(onTranscript);
    });
    
    this.ws.on('close', () => {
      // Only reconnect on unexpected closes
      if (!this.closedByUser) this.reconnect(onTranscript);
    });
  }
  
  reconnect(onTranscript) {
    if (this.reconnectAttempts < 5) {
      this.reconnectAttempts++;
      // Exponential backoff: 1s, 2s, 4s, 8s, 16s
      const delay = 1000 * 2 ** (this.reconnectAttempts - 1);
      setTimeout(() => this.connect(onTranscript), delay);
    }
  }
  
  close() {
    this.closedByUser = true;
    this.ws?.close();
  }
}