Speech WebSocket
Real-time medical speech-to-text transcription over a streaming WebSocket connection.
Overview
Stream audio from doctor-patient conversations to receive real-time transcription and trigger Avey's medical AI models to accurately extract structured subjective and objective clinical findings.
Connection details
WebSocket URL
wss://api.avey.ai/cowriter/v1/session/{session_id}/transcribe
Supported audio format
pcm_f32le
Path parameters

Prop          Type
session_id    string
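For example, the connection can be opened directly from the browser. The sketch below is only illustrative: it assumes you already have a valid session_id (however your integration obtains it) and that any required authentication is handled elsewhere.

// Minimal connection sketch; replace the placeholder session id with your own.
const sessionId = "YOUR_SESSION_ID";
const ws = new WebSocket(`wss://api.avey.ai/cowriter/v1/session/${sessionId}/transcribe`);

ws.onopen = () => {
  console.log("Transcription socket open; ready to stream audio");
};

ws.onerror = (event) => {
  console.error("Transcription socket error:", event);
};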
Lifecycle
Client-to-Server messages
Upon connection, the client streams audio data as pcm_f32le-encoded binary chunks in Float32Array format.
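For example, each Float32Array chunk produced by your audio pipeline (such as the AudioWorklet processor shown below) can be sent as a binary frame. A minimal sketch, assuming ws is the open WebSocket from the connection example above:

// Send one pcm_f32le chunk as a binary WebSocket frame.
function sendChunk(ws: WebSocket, chunk: Float32Array) {
  // A Float32Array is an ArrayBufferView, so WebSocket.send() transmits it as binary data.
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(chunk);
  }
}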
Server-to-Client messages
The server sends transcription results as JSON objects following this schema:
Prop
Type
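A minimal handler sketch, assuming ws is the open WebSocket from the connection example above; the exact fields depend on the schema, so the payload is parsed generically here:

// Handle transcription results pushed by the server.
ws.onmessage = (event: MessageEvent) => {
  // Each message is a JSON object; inspect the parsed result against the schema above.
  const message = JSON.parse(event.data as string);
  console.log("Transcription update:", message);
};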
Message flow
Note: Ensure you send audio data in the correct binary format (Float32Array).
Audio implementation
This example shows how to capture and send audio data to the WebSocket from a React application. You'll need two components: an AudioWorklet processor for low-latency audio processing on a dedicated thread, and a React hook to manage the audio context lifecycle. Together they capture microphone input, downsample it to 16 kHz, and post pcm_f32le chunks ready for speech recognition.
Audio processor
This AudioWorklet processor runs in a separate audio thread, ensuring minimal latency and smooth audio capture:
// audio-processor.js
class AudioProcessor extends AudioWorkletProcessor {
constructor() {
super();
this.desiredSampleRate = 16000; // Desired output sample rate
this.bufferSize = 2048;
this.audioBuffer = new Float32Array(this.bufferSize);
this.bufferIndex = 0;
}
// process() is called on every audio chunk received from the browser.
// For recording, we focus on the 'inputs'.
process(inputs, outputs, parameters) {
// inputs[0] refers to the first input connected to the node.
// inputs[0][0] refers to the first channel of the first input.
const inputChannelData = inputs[0]?.[0]; // Use optional chaining for safety
// Check if there's valid input data
if (inputChannelData instanceof Float32Array && inputChannelData.length > 0) {
// * For each audio data chunk we receive, we will store it in the audioBuffer until full
// * Once full, we will send it to the main thread for processing
// * and any remaining data will be stored in the audioBuffer for the next iteration
let remainingData = inputChannelData;
while (remainingData.length > 0) {
// Determine how much space is left in the buffer
const spaceInBuffer = this.bufferSize - this.bufferIndex;
// Take only the portion that fits in the current buffer
const chunk = remainingData.slice(0, spaceInBuffer);
// Append the chunk to the internal audio buffer
this.audioBuffer.set(chunk, this.bufferIndex);
this.bufferIndex += chunk.length;
// If the buffer is full, downsample and send it to the main thread
if (this.bufferIndex === this.bufferSize) {
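// `sampleRate` is a global provided by AudioWorkletGlobalScope: the sample rate of the AudioContext.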
const downsampled = this.downsampleBuffer(this.audioBuffer, sampleRate, this.desiredSampleRate);
// Send the downsampled buffer to the main thread
// Transfer the underlying ArrayBuffer to avoid copying
// This is crucial for performance, as it avoids copying the buffer
this.port.postMessage(downsampled, [downsampled.buffer]);
// Reset buffer index for next batch
this.bufferIndex = 0;
}
// Remove the processed chunk from the remaining data
remainingData = remainingData.slice(chunk.length);
}
}
// Return true to keep the processor alive.
// If you return false, it will be garbage collected.
return true;
}
downsampleBuffer(buffer, inputSampleRate, outputSampleRate) {
if (inputSampleRate === outputSampleRate) {
// If no resampling is needed, return a copy of the buffer
return new Float32Array(buffer);
}
if (inputSampleRate < outputSampleRate) {
throw new Error("Input sample rate must be greater than output sample rate");
}
const sampleRateRatio = inputSampleRate / outputSampleRate;
const outputLength = Math.floor(buffer.length / sampleRateRatio);
const result = new Float32Array(outputLength);
let outputIndex = 0;
let inputIndex = 0;
while (outputIndex < outputLength) {
// Calculate the precise fractional index in the input buffer
const theoreticalInputIndex = inputIndex;
// Find the two nearest input samples
const index1 = Math.floor(theoreticalInputIndex);
const index2 = index1 + 1;
// Calculate the interpolation fraction
const fraction = theoreticalInputIndex - index1;
// Get the input sample values, handle edge case for the last sample
const value1 = buffer[index1];
const value2 = index2 < buffer.length ? buffer[index2] : buffer[index1]; // Use last sample if index2 is out of bounds
// Linear interpolation
result[outputIndex] = value1 + (value2 - value1) * fraction;
// Move to the next output sample position in the input buffer space
inputIndex += sampleRateRatio;
outputIndex++;
}
return result;
}
}
registerProcessor("audio-processor", AudioProcessor);
Important: This file must be vanilla JavaScript (not TypeScript) and placed in your public/ directory.
React hook
This hook integrates the audio processor with your React application, handling microphone permissions and providing audio data via a callback:
import { useRef, useState } from "react";
/**
* Custom hook for processing audio using AudioWorklet.
*
* This hook captures microphone audio, processes it with an AudioWorkletProcessor,
* and sends the processed audio data to a WebSocket connection.
*/
type AudioProcessor = {
isStreaming: boolean;
isPaused: boolean;
startProcessing: () => Promise<void>;
stopProcessing: () => void;
toggleProcessing: () => void;
};
const useAudioProcessor = (onAudio: (data: Float32Array) => void): AudioProcessor => {
const audioContextRef = useRef<AudioContext | null>(null);
const workletNodeRef = useRef<AudioWorkletNode | null>(null);
const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
const mediaStreamRef = useRef<MediaStream | null>(null);
const [isStreaming, setIsStreaming] = useState(false);
const [isPaused, setIsPaused] = useState(false);
const startProcessing = async () => {
if (isStreaming && !isPaused) return;
try {
// Create or reuse AudioContext and stream
const audioContext = audioContextRef.current || new AudioContext();
audioContextRef.current = audioContext;
const stream = mediaStreamRef.current || (await navigator.mediaDevices.getUserMedia({ audio: true }));
mediaStreamRef.current = stream;
// Create or reuse worklet node
if (!workletNodeRef.current) {
await audioContext.audioWorklet.addModule("/audio-processor.js");
const workletNode = new AudioWorkletNode(audioContext, "audio-processor");
workletNode.port.onmessage = (event) => onAudio(event.data);
workletNodeRef.current = workletNode;
}
// Create or reuse source node
if (!sourceNodeRef.current) {
sourceNodeRef.current = audioContext.createMediaStreamSource(stream);
}
// Connect nodes
sourceNodeRef.current.connect(workletNodeRef.current);
workletNodeRef.current.connect(audioContext.destination);
setIsStreaming(true);
setIsPaused(false);
} catch (error) {
setIsStreaming(false);
setIsPaused(false);
throw new Error("Unknown error setting up audio processing: " + error);
}
};
const stopProcessing = () => {
if (workletNodeRef.current) {
workletNodeRef.current.disconnect();
workletNodeRef.current = null;
}
if (sourceNodeRef.current) {
sourceNodeRef.current.disconnect();
sourceNodeRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach((track) => track.stop());
mediaStreamRef.current = null;
}
setIsStreaming(false);
setIsPaused(false);
};
const toggleProcessing = () => {
if (!isPaused) {
workletNodeRef.current?.disconnect();
sourceNodeRef.current?.disconnect();
setIsPaused(true);
} else {
if (sourceNodeRef.current && workletNodeRef.current && audioContextRef.current) {
sourceNodeRef.current.connect(workletNodeRef.current);
workletNodeRef.current.connect(audioContextRef.current.destination);
setIsPaused(false);
} else {
startProcessing();
}
}
};
return {
isStreaming,
isPaused,
startProcessing,
stopProcessing,
toggleProcessing,
};
};
export default useAudioProcessor;
Usage example
Here's how to use these components together:
import useAudioProcessor from './useAudioProcessor';
function SpeechRecognition() {
const sendAudioData = (audioData: Float32Array) => {
// `websocket` is assumed to be an open WebSocket connection to the transcribe endpoint;
// see the integration sketch below for one way to manage it.
websocket.send(audioData);
};
const { isStreaming, startProcessing, stopProcessing } = useAudioProcessor(sendAudioData);
return (
<div>
<button onClick={startProcessing} disabled={isStreaming}>
Start Recording
</button>
<button onClick={stopProcessing} disabled={!isStreaming}>
Stop Recording
</button>
</div>
);
}
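To tie the pieces together, here is a sketch of one way to wire the hook to the transcription socket. It is illustrative only: it assumes the session_id has already been created, that any required authentication is handled elsewhere, and it omits reconnection and error handling.

import { useRef } from "react";
import useAudioProcessor from "./useAudioProcessor";

function LiveTranscription({ sessionId }: { sessionId: string }) {
  const wsRef = useRef<WebSocket | null>(null);

  const sendAudioData = (audioData: Float32Array) => {
    // Forward each pcm_f32le chunk as a binary frame while the socket is open.
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(audioData);
    }
  };

  const { isStreaming, startProcessing, stopProcessing } = useAudioProcessor(sendAudioData);

  const handleStart = async () => {
    const ws = new WebSocket(`wss://api.avey.ai/cowriter/v1/session/${sessionId}/transcribe`);
    ws.onmessage = (event) => {
      // Transcription results arrive as JSON (see the server-to-client schema above).
      console.log("Transcription update:", JSON.parse(event.data));
    };
    wsRef.current = ws;
    await startProcessing();
  };

  const handleStop = () => {
    stopProcessing();
    wsRef.current?.close();
    wsRef.current = null;
  };

  return (
    <div>
      <button onClick={handleStart} disabled={isStreaming}>
        Start recording
      </button>
      <button onClick={handleStop} disabled={!isStreaming}>
        Stop recording
      </button>
    </div>
  );
}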