import asyncio
import websockets
import json
import base64
import os

async def transcribe_audio():
    api_key = os.environ.get("TOGETHER_API_KEY")
    url = "wss://api.together.ai/v1/realtime?model=openai/whisper-large-v3&input_audio_format=pcm_s16le_16000"
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    async with websockets.connect(url, additional_headers=headers) as ws:
        # Read audio file (assumed to contain 16kHz, 16-bit mono PCM;
        # the raw file bytes, including any WAV header, are sent as-is)
        with open("audio.wav", "rb") as f:
            audio_data = f.read()

        # Send audio in chunks with delay to simulate real-time
        chunk_size = 8192
        bytes_per_second = 16000 * 2  # 16kHz * 2 bytes (16-bit)
        delay_per_chunk = chunk_size / bytes_per_second

        for i in range(0, len(audio_data), chunk_size):
            chunk = audio_data[i:i + chunk_size]
            base64_chunk = base64.b64encode(chunk).decode("utf-8")
            await ws.send(json.dumps({
                "type": "input_audio_buffer.append",
                "audio": base64_chunk
            }))
            # Simulate real-time streaming
            if i + chunk_size < len(audio_data):
                await asyncio.sleep(delay_per_chunk)

        # Commit the audio buffer to trigger transcription
        await ws.send(json.dumps({
            "type": "input_audio_buffer.commit"
        }))

        # Receive transcription results
        async for message in ws:
            data = json.loads(message)
            if data["type"] == "conversation.item.input_audio_transcription.delta":
                print(f"Partial: {data['delta']}")
            elif data["type"] == "conversation.item.input_audio_transcription.completed":
                print(f"Final: {data['transcript']}")
                break
            elif data["type"] == "conversation.item.input_audio_transcription.failed":
                error = data.get("error", {})
                print(f"Error: {error.get('message')}")
                break

asyncio.run(transcribe_audio())

Establishes a WebSocket connection for real-time audio transcription. This endpoint uses the WebSocket protocol (wss://api.together.ai/v1/realtime) for bidirectional streaming communication.
Connection Setup:
Authorization (header): Bearer authentication header of the form Bearer <token>, where <token> is your auth token.
model (query parameter): The Whisper model to use for transcription.
input_audio_format (query parameter): Audio format specification. Currently supports 16-bit PCM at 16kHz sample rate (pcm_s16le_16000).
On success, the server responds with 101 Switching Protocols and the WebSocket connection is established.
Client Events:
input_audio_buffer.append: Send audio chunks as base64-encoded data
{
  "type": "input_audio_buffer.append",
  "audio": "<base64_encoded_audio_chunk>"
}
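For illustration, a minimal helper (hypothetical, not part of the API) that wraps a raw PCM chunk into this event:

import base64
import json

def append_event(chunk: bytes) -> str:
    # Base64-encode the raw PCM bytes and wrap them in an append event
    return json.dumps({
        "type": "input_audio_buffer.append",
        "audio": base64.b64encode(chunk).decode("utf-8")
    })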
input_audio_buffer.commit: Signal end of audio stream. When VAD is enabled, the server automatically detects speech boundaries and emits completed events. When VAD is disabled, you must send commit to trigger transcription of the buffered audio.
{
  "type": "input_audio_buffer.commit"
}
transcription_session.updated: Update session configuration, including Voice Activity Detection (VAD) parameters. Send this after receiving session.created; it can also be sent at any time during the session to change VAD settings.
{
  "type": "transcription_session.updated",
  "session": {
    "turn_detection": {
      "type": "server_vad",
      "threshold": 0.3,
      "min_silence_duration_ms": 500,
      "min_speech_duration_ms": 250,
      "max_speech_duration_s": 5.0,
      "speech_pad_ms": 250
    }
  }
}
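As a client-side sketch (assuming ws is the open connection from the example above), wait for session.created and then apply the VAD settings:

import json

async def configure_vad(ws):
    # Wait for the server's session.created, then push new VAD settings
    async for message in ws:
        event = json.loads(message)
        if event["type"] == "session.created":
            await ws.send(json.dumps({
                "type": "transcription_session.updated",
                "session": {
                    "turn_detection": {
                        "type": "server_vad",
                        "threshold": 0.3,
                        "min_silence_duration_ms": 500
                    }
                }
            }))
            break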
To disable VAD entirely (manual commit mode), set turn_detection to null:
{
  "type": "transcription_session.updated",
  "session": {
    "turn_detection": null
  }
}
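A minimal sketch of manual-commit mode, assuming ws is an open connection (Python's None serializes to JSON null):

import json

async def manual_commit_mode(ws):
    # Disable server-side VAD; transcription now waits for an explicit commit
    await ws.send(json.dumps({
        "type": "transcription_session.updated",
        "session": {"turn_detection": None}
    }))
    # ... send input_audio_buffer.append events here ...
    await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))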
Voice Activity Detection (VAD)
VAD controls how the server automatically detects speech segments in the audio stream. When enabled (the default), the server uses Silero VAD to identify speech regions and emits transcription events as each segment completes. When disabled, you must manually call input_audio_buffer.commit to trigger transcription.
VAD can be configured in two ways:
1. Query parameters at connection time, e.g. turn_detection=server_vad&threshold=0.3&min_silence_duration_ms=500
2. A transcription_session.updated message with a turn_detection object (see above)

To disable VAD at connection time, use turn_detection=none as a query parameter.
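For example, a connection URL with VAD configured via query parameters could be built like this (a sketch using the parameters documented below):

from urllib.parse import urlencode

params = {
    "model": "openai/whisper-large-v3",
    "input_audio_format": "pcm_s16le_16000",
    "turn_detection": "server_vad",
    "threshold": 0.3,
    "min_silence_duration_ms": 500
}
# urlencode escapes the "/" in the model name; servers typically decode it
url = f"wss://api.together.ai/v1/realtime?{urlencode(params)}"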
VAD Parameters:
All parameters are optional; omitted fields use their defaults.
| Parameter | Type | Default | Description |
|---|---|---|---|
| type | string | server_vad | VAD mode. Use server_vad to enable, or set turn_detection to null to disable. |
| threshold | float | 0.3 | Speech probability threshold (0.0–1.0). Audio frames with probability above this value are classified as speech. Lower values detect more speech but may increase false positives. For low-SNR audio (e.g., 8kHz phone calls), values of 0.01–0.2 may work better. |
| min_silence_duration_ms | int | 500 | Minimum silence duration in milliseconds before ending a speech segment. Higher values merge nearby speech bursts into single segments. For phone calls with mid-sentence pauses, 2000–5000ms prevents over-segmentation. |
| min_speech_duration_ms | int | 250 | Minimum speech segment duration in milliseconds. Segments shorter than this are discarded. Filters out brief noise bursts or clicks. |
| max_speech_duration_s | float | 5.0 | Maximum speech segment duration in seconds. Segments longer than this are force-split at the longest internal silence gap. Useful for continuous speech without natural pauses. |
| speech_pad_ms | int | 250 | Padding in milliseconds added to the start and end of each detected segment. Prevents clipping speech edges. When padding would cause adjacent segments to overlap, the gap is split at the midpoint instead. |
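For example, a turn_detection object tuned for low-SNR phone audio, following the table's guidance (values are illustrative, not prescriptive):

phone_call_vad = {
    "type": "server_vad",
    "threshold": 0.1,                 # low threshold for quiet, noisy speech
    "min_silence_duration_ms": 3000,  # tolerate long mid-sentence pauses
    "min_speech_duration_ms": 250,
    "speech_pad_ms": 250
}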
Server Events:
session.created: Initial session confirmation (sent first)
{
  "type": "session.created",
  "session": {
    "id": "session-id",
    "object": "realtime.session",
    "modalities": ["audio"],
    "model": "openai/whisper-large-v3"
  }
}
transcription_session.updated: Confirms session configuration was applied. Sent in response to a client transcription_session.updated message.
{
  "type": "transcription_session.updated",
  "session": {
    "turn_detection": {
      "type": "server_vad",
      "threshold": 0.3,
      "min_silence_duration_ms": 500,
      "min_speech_duration_ms": 250,
      "max_speech_duration_s": 5.0,
      "speech_pad_ms": 250
    }
  }
}
conversation.item.input_audio_transcription.delta: Partial transcription results
{
  "type": "conversation.item.input_audio_transcription.delta",
  "delta": "The quick brown"
}
conversation.item.input_audio_transcription.completed: Final transcription
{
  "type": "conversation.item.input_audio_transcription.completed",
  "transcript": "The quick brown fox jumps over the lazy dog"
}
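A sketch of consuming these events, accumulating deltas until the final transcript arrives (assumes deltas concatenate in arrival order; ws is an open connection):

import json

async def collect_transcript(ws) -> str:
    partial = ""
    async for message in ws:
        event = json.loads(message)
        if event["type"] == "conversation.item.input_audio_transcription.delta":
            partial += event["delta"]  # assumed to concatenate in order
        elif event["type"] == "conversation.item.input_audio_transcription.completed":
            return event["transcript"]  # authoritative final text
    return partial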
conversation.item.input_audio_transcription.failed: Error occurred
{
  "type": "conversation.item.input_audio_transcription.failed",
  "error": {
    "message": "Error description",
    "type": "invalid_request_error",
    "param": null,
    "code": "invalid_api_key"
  }
}
Error Codes:
invalid_api_key: Invalid API key provided (401)
missing_api_key: Authorization header missing (401)
model_not_available: Invalid or unavailable model (400)
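A sketch of mapping these codes to client-side handling (the function name is illustrative):

def handle_failure(event: dict) -> None:
    error = event.get("error", {})
    code = error.get("code")
    if code in ("invalid_api_key", "missing_api_key"):
        raise RuntimeError("Authentication failed (401); check TOGETHER_API_KEY")
    if code == "model_not_available":
        raise ValueError("Invalid or unavailable model (400)")
    print(f"Transcription failed: {error.get('message')}")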
Error message format:
{
  "type": "conversation.item.input_audio_transcription.failed",
  "error": {
    "message": "Error description",
    "type": "invalid_request_error",
    "param": null,
    "code": "error_code"
  }
}