GET /audio/speech/websocket
Python WebSocket Client
import asyncio
import websockets
import json
import base64
import os

async def generate_speech():
    api_key = os.environ.get("TOGETHER_API_KEY")
    url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=tara"

    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    async with websockets.connect(url, additional_headers=headers) as ws:
        # Wait for session created
        session_msg = await ws.recv()
        session_data = json.loads(session_msg)
        print(f"Session created: {session_data['session']['id']}")

        # Send text for TTS
        text_chunks = [
            "Hello, this is a test.",
            "This is the second sentence.",
            "And this is the final one."
        ]

        async def send_text():
            for chunk in text_chunks:
                await ws.send(json.dumps({
                    "type": "input_text_buffer.append",
                    "text": chunk
                }))
                await asyncio.sleep(0.5)  # Simulate typing

            # Commit to process any remaining text
            await ws.send(json.dumps({
                "type": "input_text_buffer.commit"
            }))

        async def receive_audio():
            audio_data = bytearray()
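            # Read events until the server closes the connection
            # (or until we break out below on a TTS failure)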
            async for message in ws:
                data = json.loads(message)
                
                if data["type"] == "conversation.item.input_text.received":
                    print(f"Text received: {data['text']}")
                elif data["type"] == "conversation.item.audio_output.delta":
                    # Decode base64 audio chunk
                    audio_chunk = base64.b64decode(data['delta'])
                    audio_data.extend(audio_chunk)
                    print(f"Received audio chunk for item {data['item_id']}")
                elif data["type"] == "conversation.item.audio_output.done":
                    print(f"Audio generation complete for item {data['item_id']}")
                elif data["type"] == "conversation.item.tts.failed":
                    error = data.get("error", {})
                    print(f"Error: {error.get('message')}")
                    break

            # Save the audio to a file
            with open("output.wav", "wb") as f:
                f.write(audio_data)
            print("Audio saved to output.wav")

        # Run send and receive concurrently
        await asyncio.gather(send_text(), receive_audio())

asyncio.run(generate_speech())

Authorizations

Authorization (string, header, required, default: default)

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.
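
A minimal connection sketch with this header in place (it mirrors the full client above and assumes the same TOGETHER_API_KEY environment variable):

import asyncio
import os

import websockets

async def open_connection():
    token = os.environ.get("TOGETHER_API_KEY")
    if not token:
        raise RuntimeError("TOGETHER_API_KEY is not set")

    # Bearer authentication header of the form "Bearer <token>"
    headers = {"Authorization": f"Bearer {token}"}

    async with websockets.connect(
        "wss://api.together.ai/v1/audio/speech/websocket",
        additional_headers=headers,
    ) as ws:
        # The first message is the session-created event (see the client above)
        print(await ws.recv())

asyncio.run(open_connection())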

Query Parameters

model (enum<string>, default: hexgrad/Kokoro-82M)

The TTS model to use for speech generation. Can also be set via the tts_session.updated event.

Available options: hexgrad/Kokoro-82M, cartesia/sonic-english
voice (string, default: tara)

The voice to use for speech generation. Available voices vary by model. Can also be updated via the tts_session.updated event.

max_partial_length (integer, default: 250)

Maximum number of characters of partial text to buffer before TTS generation is forced, even without a sentence ending. This helps reduce latency for long text without punctuation.
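
These parameters are passed as a query string on the connection URL, as in the client example at the top. A short sketch of assembling that URL with Python's standard library (the values shown are illustrative, not additional defaults):

from urllib.parse import urlencode

BASE_URL = "wss://api.together.ai/v1/audio/speech/websocket"

# Documented query parameters; the values here are examples only.
params = {
    "model": "hexgrad/Kokoro-82M",  # or "cartesia/sonic-english"
    "voice": "tara",
    "max_partial_length": 250,      # force generation after 250 characters without punctuation
}

# safe="/" keeps the slash in the model name unencoded, matching the URL format above
url = f"{BASE_URL}?{urlencode(params, safe='/')}"

# Pass `url` to websockets.connect() together with the Authorization header,
# exactly as in the client example above.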

Response

101

Switching Protocols - WebSocket connection established successfully.

Error message format:

{
  "type": "conversation.item.tts.failed",
  "error": {
    "message": "Error description",
    "type": "invalid_request_error",
    "param": null,
    "code": "error_code"
  }
}
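
A short sketch of handling this event in a receive loop, using only the fields shown above (the logging format is illustrative):

import json

def handle_tts_failure(message: str) -> bool:
    """Log the details of a conversation.item.tts.failed event; return True if it was one."""
    data = json.loads(message)
    if data.get("type") != "conversation.item.tts.failed":
        return False
    error = data.get("error", {})
    print(
        f"TTS failed: {error.get('message')} "
        f"(type={error.get('type')}, code={error.get('code')}, param={error.get('param')})"
    )
    return True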