
Speech WebSocket

Real-time medical speech-to-text transcription of streamed audio over a WebSocket connection.

Overview

Stream audio from doctor-patient conversations to receive real-time transcription and trigger Avey's medical AI models to accurately extract structured subjective and objective clinical findings.


Connection details

WebSocket URL

wss://api.avey.ai/cowriter/v1/session/{session_id}/transcribe

Supported Audio Format

pcm_f32le (32-bit little-endian float PCM). The example implementation below streams mono audio downsampled to 16 kHz.

Path Parameters

Prop          Type
session_id    string

Lifecycle

Client-to-Server messages

Once connected, the client streams audio data to the server as pcm_f32le-encoded binary chunks (Float32Array).
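
The sketch below (TypeScript) shows one way to open the transcription socket and stream chunks to it. It assumes a standard browser WebSocket; the sessionId value and the sendAudioChunk helper are placeholders, not part of the API.

Example: opening the socket and streaming audio
const sessionId = "YOUR_SESSION_ID"; // placeholder: obtain this from your own session workflow
const socket = new WebSocket(`wss://api.avey.ai/cowriter/v1/session/${sessionId}/transcribe`);

// Call this with each downsampled Float32Array chunk produced by the
// audio processor shown later in this guide.
function sendAudioChunk(chunk: Float32Array) {
  if (socket.readyState === WebSocket.OPEN) {
    socket.send(chunk); // transmitted as a binary pcm_f32le frame
  }
}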


Server-to-Client messages

The server sends transcription results as JSON objects following this schema:

Prop          Type

Message flow

Note: Ensure you send audio data in the correct binary format (Float32Array).
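
As a rough end-to-end sketch, the flow can be wired up as follows, reusing the socket and sendAudioChunk helper from the earlier snippet. The numbered comments are a summary of this guide, not an official sequence diagram, and the result field names are defined by the schema table above rather than shown here.

Example: message flow
socket.onopen = () => {
  // 1. Start capturing microphone audio and streaming pcm_f32le chunks
  //    (the audio implementation below produces the Float32Array chunks).
};

socket.onmessage = (event: MessageEvent) => {
  // 2. The server replies with transcription results as JSON text frames;
  //    the exact fields follow the schema documented above.
  if (typeof event.data !== "string") return;
  console.log("Transcription update:", JSON.parse(event.data));
};

socket.onclose = () => {
  // 3. Stop audio capture and release the microphone when the session ends.
};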


Audio implementation

This example shows how to capture microphone audio and send it to the WebSocket from a React application. You'll need two components: an AudioWorklet processor that handles low-latency audio processing off the main thread, and a React hook that manages the audio context lifecycle. Together they capture microphone input, downsample it to 16 kHz, and post pcm_f32le chunks ready for speech recognition.

Audio processor

This AudioWorklet processor runs in a separate audio thread, ensuring minimal latency and smooth audio capture:

audio-processor.js
// audio-processor.js
class AudioProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.desiredSampleRate = 16000; // Desired output sample rate
    this.bufferSize = 2048;
    this.audioBuffer = new Float32Array(this.bufferSize);
    this.bufferIndex = 0;
  }

  // process() is called on every audio chunk received from the browser.
  // For recording, we focus on the 'inputs'.
  process(inputs, outputs, parameters) {
    // inputs[0] refers to the first input connected to the node.
    // inputs[0][0] refers to the first channel of the first input.
    const inputChannelData = inputs[0]?.[0]; // Use optional chaining for safety

    // Check if there's valid input data
    if (inputChannelData instanceof Float32Array && inputChannelData.length > 0) {
      // * For each audio data chunk we receive, we will store it in the audioBuffer until full
      // * Once full, we will send it to the main thread for processing
      // * and any remaining data will be stored in the audioBuffer for the next iteration

      let remainingData = inputChannelData;
      while (remainingData.length > 0) {
        // Determine how much space is left in the buffer
        const spaceInBuffer = this.bufferSize - this.bufferIndex;

        // Take only the portion that fits in the current buffer
        const chunk = remainingData.slice(0, spaceInBuffer);

        // Append the chunk to the internal audio buffer
        this.audioBuffer.set(chunk, this.bufferIndex);
        this.bufferIndex += chunk.length;

        // If the buffer is full, downsample and send it to the main thread
        if (this.bufferIndex === this.bufferSize) {
          // `sampleRate` is a global provided by AudioWorkletGlobalScope:
          // the sample rate of the AudioContext this processor runs in.
          const downsampled = this.downsampleBuffer(this.audioBuffer, sampleRate, this.desiredSampleRate);

          // Send the downsampled buffer to the main thread
          // Transfer the underlying ArrayBuffer to avoid copying
          // This is crucial for performance, as it avoids copying the buffer
          this.port.postMessage(downsampled, [downsampled.buffer]);

          // Reset buffer index for next batch
          this.bufferIndex = 0;
        }

        // Remove the processed chunk from the remaining data
        remainingData = remainingData.slice(chunk.length);
      }
    }

    // Return true to keep the processor alive.
    // If you return false, it will be garbage collected.
    return true;
  }

  downsampleBuffer(buffer, inputSampleRate, outputSampleRate) {
    if (inputSampleRate === outputSampleRate) {
      // If no resampling is needed, return a copy of the buffer
      return new Float32Array(buffer);
    }
    if (inputSampleRate < outputSampleRate) {
      throw new Error("Input sample rate must be greater than output sample rate");
    }

    const sampleRateRatio = inputSampleRate / outputSampleRate;
    const outputLength = Math.floor(buffer.length / sampleRateRatio);
    const result = new Float32Array(outputLength);
    let outputIndex = 0;
    let inputIndex = 0;

    while (outputIndex < outputLength) {
      // Calculate the precise fractional index in the input buffer
      const theoreticalInputIndex = inputIndex;

      // Find the two nearest input samples
      const index1 = Math.floor(theoreticalInputIndex);
      const index2 = index1 + 1;

      // Calculate the interpolation fraction
      const fraction = theoreticalInputIndex - index1;

      // Get the input sample values, handle edge case for the last sample
      const value1 = buffer[index1];
      const value2 = index2 < buffer.length ? buffer[index2] : buffer[index1]; // Use last sample if index2 is out of bounds

      // Linear interpolation
      result[outputIndex] = value1 + (value2 - value1) * fraction;

      // Move to the next output sample position in the input buffer space
      inputIndex += sampleRateRatio;
      outputIndex++;
    }

    return result;
  }
}

registerProcessor("audio-processor", AudioProcessor);

Important: This file must be vanilla JavaScript (not TypeScript) and placed in your public/ directory so the browser can load it at runtime via audioContext.audioWorklet.addModule("/audio-processor.js").

React hook

This hook integrates the audio processor with your React application, handling microphone permissions and providing audio data via a callback:

useAudioProcessor.ts
import { useRef, useState } from "react";

/**
 * Custom hook for processing audio using AudioWorklet.
 *
 * This hook captures microphone audio, processes it with an AudioWorkletProcessor,
 * and sends the processed audio data to a WebSocket connection.
 */

type AudioProcessor = {
  isStreaming: boolean;
  isPaused: boolean;
  startProcessing: () => Promise<void>;
  stopProcessing: () => void;
  toggleProcessing: () => void;
};
const useAudioProcessor = (onAudio: (data: Float32Array) => void): AudioProcessor => {
  const audioContextRef = useRef<AudioContext | null>(null);
  const workletNodeRef = useRef<AudioWorkletNode | null>(null);
  const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const [isStreaming, setIsStreaming] = useState(false);
  const [isPaused, setIsPaused] = useState(false);

  const startProcessing = async () => {
    if (isStreaming && !isPaused) return;

    try {
      // Create or reuse AudioContext and stream
      const audioContext = audioContextRef.current || new AudioContext();
      audioContextRef.current = audioContext;

      // Browsers may leave the context in a suspended state (autoplay
      // policies); resume it before capturing audio.
      if (audioContext.state === "suspended") {
        await audioContext.resume();
      }

      const stream = mediaStreamRef.current || (await navigator.mediaDevices.getUserMedia({ audio: true }));
      mediaStreamRef.current = stream;

      // Create or reuse worklet node
      if (!workletNodeRef.current) {
        await audioContext.audioWorklet.addModule("/audio-processor.js");
        const workletNode = new AudioWorkletNode(audioContext, "audio-processor");
        workletNode.port.onmessage = (event) => onAudio(event.data);
        workletNodeRef.current = workletNode;
      }

      // Create or reuse source node
      if (!sourceNodeRef.current) {
        sourceNodeRef.current = audioContext.createMediaStreamSource(stream);
      }

      // Connect nodes
      sourceNodeRef.current.connect(workletNodeRef.current);
      workletNodeRef.current.connect(audioContext.destination);

      setIsStreaming(true);
      setIsPaused(false);
    } catch (error) {
      setIsStreaming(false);
      setIsPaused(false);
      throw new Error("Failed to set up audio processing: " + error);
    }
  };

  const stopProcessing = () => {
    if (workletNodeRef.current) {
      workletNodeRef.current.disconnect();
      workletNodeRef.current = null;
    }

    if (sourceNodeRef.current) {
      sourceNodeRef.current.disconnect();
      sourceNodeRef.current = null;
    }

    if (audioContextRef.current) {
      audioContextRef.current.close();
      audioContextRef.current = null;
    }

    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach((track) => track.stop());
      mediaStreamRef.current = null;
    }

    setIsStreaming(false);
    setIsPaused(false);
  };

  const toggleProcessing = () => {
    if (!isPaused) {
      workletNodeRef.current?.disconnect();
      sourceNodeRef.current?.disconnect();
      setIsPaused(true);
    } else {
      if (sourceNodeRef.current && workletNodeRef.current && audioContextRef.current) {
        sourceNodeRef.current.connect(workletNodeRef.current);
        workletNodeRef.current.connect(audioContextRef.current.destination);
        setIsPaused(false);
      } else {
        startProcessing();
      }
    }
  };

  return {
    isStreaming,
    isPaused,
    startProcessing,
    stopProcessing,
    toggleProcessing,
  };
};

export default useAudioProcessor;

Usage example

Here's how to use these components together:

Example Usage
import useAudioProcessor from './useAudioProcessor';

function SpeechRecognition() {
  // `websocket` is assumed to be an open WebSocket connected to the
  // transcription endpoint (see the wiring sketch below).
  const sendAudioData = (audioData: Float32Array) => {
    websocket.send(audioData);
  };

  const { isStreaming, startProcessing, stopProcessing } = useAudioProcessor(sendAudioData);

  return (
    <div>
      <button onClick={startProcessing} disabled={isStreaming}>
        Start Recording
      </button>
      <button onClick={stopProcessing} disabled={!isStreaming}>
        Stop Recording
      </button>
    </div>
  );
}
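
For completeness, here is one possible way to wire the hook to the transcription socket in a single component. This is a sketch rather than an official reference: the sessionId prop, socketRef, and component structure are assumptions layered on top of the pieces shown above.

Example: wiring the hook to the socket
import { useRef } from "react";
import useAudioProcessor from "./useAudioProcessor";

function TranscriptionSession({ sessionId }: { sessionId: string }) {
  const socketRef = useRef<WebSocket | null>(null);

  // Forward each pcm_f32le chunk to the socket once it is open;
  // chunks produced before the connection opens are simply dropped.
  const sendAudioData = (audioData: Float32Array) => {
    if (socketRef.current?.readyState === WebSocket.OPEN) {
      socketRef.current.send(audioData);
    }
  };

  const { isStreaming, startProcessing, stopProcessing } = useAudioProcessor(sendAudioData);

  const start = async () => {
    const socket = new WebSocket(`wss://api.avey.ai/cowriter/v1/session/${sessionId}/transcribe`);
    socket.onmessage = (event) => {
      // Field names depend on the transcription result schema documented above.
      console.log("Transcription update:", JSON.parse(event.data));
    };
    socketRef.current = socket;
    await startProcessing();
  };

  const stop = () => {
    stopProcessing();
    socketRef.current?.close();
    socketRef.current = null;
  };

  return (
    <div>
      <button onClick={start} disabled={isStreaming}>Start Recording</button>
      <button onClick={stop} disabled={!isStreaming}>Stop Recording</button>
    </div>
  );
}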
