import asyncio
import base64
import json
import os
import websockets
API_KEY = os.environ["TOGETHER_API_KEY"]  # raises KeyError if the env var is unset
MODEL = "openai/whisper-large-v3"
# Server-side voice-activity-detection settings, sent once the session is
# created; keys follow the realtime API's `turn_detection` schema.
VAD_CONFIG = {
    "type": "server_vad",
    "threshold": 0.3,  # speech-probability cutoff — lower is more sensitive
    "min_silence_duration_ms": 500,  # silence required to close a speech turn
    "min_speech_duration_ms": 250,  # ignore speech bursts shorter than this
    "max_speech_duration_s": 5.0,  # force-split turns that run longer than this
    "speech_pad_ms": 250,  # padding retained around detected speech
}
async def transcribe():
    """Stream audio.wav to Together's realtime endpoint and print transcripts.

    Connects to the realtime websocket API, configures server-side VAD
    (from the module-level ``VAD_CONFIG``), streams the file in 100 ms
    chunks at real-time pace, then prints each completed transcript.

    Reads module globals: ``API_KEY``, ``MODEL``, ``VAD_CONFIG``.
    Raises:
        FileNotFoundError: if ``audio.wav`` is missing.
        websockets.exceptions.WebSocketException: on connection failures.
    """
    url = f"wss://api.together.ai/v1/realtime?model={MODEL}&input_audio_format=pcm_s16le_16000"
    headers = {"Authorization": f"Bearer {API_KEY}"}
    async with websockets.connect(url, additional_headers=headers) as ws:
        # Server greets with session.created; reply with our VAD settings.
        msg = json.loads(await ws.recv())
        if msg["type"] == "session.created":
            # BUG FIX: the client-sent event is "transcription_session.update".
            # "transcription_session.updated" is the *server's* confirmation
            # event, so the original message never applied the VAD config.
            await ws.send(
                json.dumps(
                    {
                        "type": "transcription_session.update",
                        "session": {"turn_detection": VAD_CONFIG},
                    }
                )
            )
        # NOTE(review): this streams the raw file bytes — RIFF/WAV header
        # included — as PCM. Consider stripping the header (e.g. with the
        # stdlib `wave` module); confirm whether the server tolerates it.
        with open("audio.wav", "rb") as f:
            audio = f.read()
        CHUNK = 3200  # 100 ms of 16 kHz mono s16le audio (32000 bytes/s)
        for i in range(0, len(audio), CHUNK):
            await ws.send(
                json.dumps(
                    {
                        "type": "input_audio_buffer.append",
                        "audio": base64.b64encode(audio[i : i + CHUNK]).decode(),
                    }
                )
            )
            # Pace at real time so server VAD sees realistic speech timing.
            await asyncio.sleep(0.1)
        # Flush whatever audio remains in the server-side buffer.
        await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
        # Print each completed transcript; stop on the first failure.
        async for message in ws:
            data = json.loads(message)
            if (
                data["type"]
                == "conversation.item.input_audio_transcription.completed"
            ):
                print(data["transcript"])
            elif (
                data["type"]
                == "conversation.item.input_audio_transcription.failed"
            ):
                print(f"Error: {data['error']['message']}")
                break
asyncio.run(transcribe())