Speech Services

Speech-to-Text and Text-to-Speech with Deepgram and ElevenLabs

Converting between audio and text in real-time using Deepgram and ElevenLabs.

Deepgram STT (Speech-to-Text)

Real-time transcription via WebSocket:

const WebSocket = require('ws');

class DeepgramSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
  }
  
  connect(onTranscript) {
    // Twilio media streams are mulaw-encoded at 8 kHz, mono
    const url = 'wss://api.deepgram.com/v1/listen?' + new URLSearchParams({
      encoding: 'mulaw',
      sample_rate: 8000,
      channels: 1,
      punctuate: true,
      interim_results: true // emit partial transcripts while the caller is speaking
    });
    
    this.ws = new WebSocket(url, {
      headers: { Authorization: `Token ${this.apiKey}` }
    });
    
    this.ws.on('message', (data) => {
      const response = JSON.parse(data);
      // Non-transcript messages (e.g. metadata) won't have this shape
      const transcript = response.channel?.alternatives?.[0];
      
      if (transcript?.transcript) {
        onTranscript({
          text: transcript.transcript,
          isFinal: response.is_final,
          confidence: transcript.confidence
        });
      }
    });
  }
  
  sendAudio(audioBuffer) {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
    }
  }
  
  close() {
    this.ws?.close();
  }
}
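
A minimal usage sketch, assuming audio arrives as Twilio media-stream messages over a WebSocket (twilioWs and the surrounding wiring are hypothetical):

const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);

stt.connect(({ text, isFinal, confidence }) => {
  if (isFinal) {
    console.log(`Final (${confidence.toFixed(2)}): ${text}`);
  }
});

// Twilio sends base64-encoded mulaw payloads in 'media' events
twilioWs.on('message', (raw) => {
  const msg = JSON.parse(raw);
  if (msg.event === 'media') {
    stt.sendAudio(Buffer.from(msg.media.payload, 'base64'));
  }
});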

Deepgram TTS (Text-to-Speech)

Single-shot synthesis over HTTP (uses the global fetch available in Node 18+):

class DeepgramTTS {
  constructor(apiKey) {
    this.apiKey = apiKey;
  }
  
  async synthesize(text) {
    const response = await fetch('https://api.deepgram.com/v1/speak?' + new URLSearchParams({
      model: 'aura-asteria-en',
      encoding: 'mulaw',
      sample_rate: 8000
    }), {
      method: 'POST',
      headers: {
        'Authorization': `Token ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ text })
    });
    
    if (!response.ok) {
      throw new Error(`Deepgram TTS request failed: ${response.status}`);
    }
    
    // Raw mulaw/8000 audio bytes
    return Buffer.from(await response.arrayBuffer());
  }
}
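
A usage sketch; sendToCaller is a placeholder for whatever transport carries audio back to the caller:

const tts = new DeepgramTTS(process.env.DEEPGRAM_API_KEY);

// A Buffer of mulaw/8000 bytes, ready for a Twilio media stream
const audio = await tts.synthesize('Thanks for calling. How can I help?');
sendToCaller(audio);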

ElevenLabs TTS

Streaming TTS with ElevenLabs:

class ElevenLabsTTS {
  constructor(apiKey, voiceId) {
    this.apiKey = apiKey;
    this.voiceId = voiceId;
  }
  
  async* streamSpeech(text) {
    // output_format is a query parameter, not a body field
    const response = await fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}/stream?output_format=ulaw_8000`,
      {
        method: 'POST',
        headers: {
          'xi-api-key': this.apiKey,
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({
          text: text,
          model_id: 'eleven_monolingual_v1'
        })
      }
    );
    
    if (!response.ok) {
      throw new Error(`ElevenLabs TTS request failed: ${response.status}`);
    }
    
    const reader = response.body.getReader();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      yield value;
    }
  }
}
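
Consuming the stream chunk by chunk keeps time-to-first-audio low. A sketch, again with sendToCaller as a placeholder and voiceId taken from the Voice Selection section below:

const tts = new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY, voiceId);

for await (const chunk of tts.streamSpeech('One moment while I look that up.')) {
  // Each chunk is a Uint8Array of ulaw_8000 audio; forward it immediately
  sendToCaller(Buffer.from(chunk));
}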

Audio Format Conversion

Twilio media streams carry 8 kHz mulaw audio. If your source produces 16-bit linear PCM, convert each sample:

// Encode one signed 16-bit PCM sample as an 8-bit mulaw byte
function linearToMulaw(sample) {
  const MULAW_MAX = 0x1FFF;
  const MULAW_BIAS = 33;
  
  let sign = (sample >> 8) & 0x80;
  if (sign) sample = -sample;
  sample += MULAW_BIAS;
  
  // Loud samples clip at the top of the mulaw range
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  
  // The position of the highest set bit determines the segment (exponent)
  let exponent = Math.floor(Math.log2(sample)) - 5;
  if (exponent < 0) exponent = 0;
  
  // The mantissa is the four bits below the leading bit
  let mantissa = (sample >> (exponent + 1)) & 0x0F;
  let mulaw = ~(sign | (exponent << 4) | mantissa);
  
  return mulaw & 0xFF;
}
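
To convert a whole frame, apply the encoder per sample. A sketch assuming 16-bit little-endian PCM input:

function pcmToMulawBuffer(pcm) {
  // One mulaw byte per 16-bit PCM sample
  const out = Buffer.alloc(pcm.length / 2);
  for (let i = 0; i < out.length; i++) {
    out[i] = linearToMulaw(pcm.readInt16LE(i * 2));
  }
  return out;
}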

Handling Interim Results

Interim results arrive before the final transcript; use them to respond faster and to detect that the caller is still speaking:

class TranscriptionHandler {
  constructor() {
    this.lastInterim = '';
  }
  
  onTranscript({ text, isFinal }) {
    if (isFinal) {
      // Final result - process it
      this.processText(text);
      this.lastInterim = '';
    } else {
      // Interim result - could use for interruption detection
      if (text.length > this.lastInterim.length + 10) {
        // User is still speaking
        this.onUserSpeaking();
      }
      this.lastInterim = text;
    }
  }
  
  // Application-specific hooks
  processText(text) { /* e.g. send the utterance to the LLM */ }
  onUserSpeaking() { /* e.g. stop TTS playback (barge-in) */ }
}
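
Wired to the DeepgramSTT class from above:

const handler = new TranscriptionHandler();
const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);

// Every Deepgram result (interim and final) flows through the handler
stt.connect((result) => handler.onTranscript(result));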

Voice Selection

Deepgram Voices

Voice            Description
aura-asteria-en  Female, American
aura-luna-en     Female, American
aura-stella-en   Female, American
aura-orion-en    Male, American
aura-arcas-en    Male, American

ElevenLabs

Get voice IDs from the ElevenLabs Voice Library or API:

curl -X GET "https://api.elevenlabs.io/v1/voices" \
  -H "xi-api-key: YOUR_API_KEY"

Latency Optimization

  1. Use streaming - Don't wait for complete responses
  2. Chunk text - Send to TTS in small pieces
  3. Pre-buffer - Start TTS before LLM finishes
  4. Connection pooling - Reuse WebSocket connections (see the reuse sketch after the chunking example)

Sentence-level chunking (points 1-3) in practice:

// Send to TTS as soon as we have a sentence
let buffer = '';

for await (const token of llmStream) {
  buffer += token;
  
  // Check for sentence boundary
  const match = buffer.match(/^(.+?[.!?])\s*/);
  if (match) {
    tts.speak(match[1]);
    buffer = buffer.slice(match[0].length);
  }
}

// Don't forget remaining text
if (buffer.trim()) {
  tts.speak(buffer);
}
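
For point 4, a minimal reuse sketch built on the DeepgramSTT class from above; callId and onTranscript are hypothetical wiring:

// Keep one STT socket open per call instead of reconnecting every turn
const sttByCall = new Map();

function getSTT(callId, onTranscript) {
  if (!sttByCall.has(callId)) {
    const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);
    stt.connect(onTranscript);
    sttByCall.set(callId, stt);
  }
  return sttByCall.get(callId);
}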

Error Handling

Reconnect automatically when the STT socket errors or closes unexpectedly:

class ResilientSTT {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.reconnectAttempts = 0;
    this.closedByUser = false;
  }
  
  connect(onTranscript) {
    // ... connection code ...
    
    this.ws.on('open', () => {
      // Reset the backoff once a connection succeeds
      this.reconnectAttempts = 0;
    });
    
    this.ws.on('error', (error) => {
      console.error('STT error:', error);
      this.reconnect(onTranscript);
    });
    
    this.ws.on('close', () => {
      // Only reconnect on unexpected closes
      if (!this.closedByUser) this.reconnect(onTranscript);
    });
  }
  
  reconnect(onTranscript) {
    if (this.reconnectAttempts < 5) {
      this.reconnectAttempts++;
      // Exponential backoff: 1s, 2s, 4s, 8s, 16s
      const delay = 1000 * 2 ** (this.reconnectAttempts - 1);
      setTimeout(() => this.connect(onTranscript), delay);
    }
  }
  
  close() {
    this.closedByUser = true;
    this.ws?.close();
  }
}