> ## Documentation Index
> Fetch the complete documentation index at: https://docs.together.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create audio generation request

> Generate audio from input text


## OpenAPI

````yaml POST /audio/speech
# Document header: OpenAPI version, API metadata (title, version, legal and
# contact links), the base server URL, and the default security requirement.
openapi: 3.1.0
info:
  title: Together APIs
  description: The Together REST API. See https://docs.together.ai for more details.
  version: 2.0.0
  termsOfService: https://www.together.ai/terms-of-service
  contact:
    name: Together Support
    url: https://www.together.ai/contact
  license:
    name: MIT
    url: https://github.com/togethercomputer/openapi/blob/main/LICENSE
# Operation paths are resolved relative to this base URL.
servers:
  - url: https://api.together.ai/v1
# Default security requirement; refers to the `bearerAuth` scheme by name.
security:
  - bearerAuth: []
paths:
  /audio/speech:
    post:
      tags:
        - Audio
      summary: Create audio generation request
      description: Generate audio from input text
      operationId: audio-speech
      # JSON request payload. Marked as required because the referenced schema
      # declares mandatory properties, so a request without a body can never be
      # valid (OpenAPI treats a request body as optional unless stated).
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioSpeechRequest'
      # Documented outcomes: a 200 carrying the generated audio (as a binary
      # body or as an event stream) and JSON error payloads for 400 and 429.
      responses:
        '200':
          description: OK
          content:
            # Binary audio body, declared under three media types.
            application/octet-stream:
              schema:
                type: string
                format: binary
            audio/wav:
              schema:
                type: string
                format: binary
            audio/mpeg:
              schema:
                type: string
                format: binary
            # Event-stream variant; messages follow the referenced schema.
            text/event-stream:
              schema:
                $ref: '#/components/schemas/AudioSpeechStreamResponse'
        '400':
          description: BadRequest
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
        '429':
          description: RateLimit
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
      x-codeSamples:
        # Python SDK v2 sample: issues the speech request through the
        # streaming-response wrapper and writes the audio to audio.wav.
        # The snippet uses 4-space indentation throughout.
        - lang: Python
          label: Together AI SDK (v2)
          source: |
            # Docs for v1 can be found by changing the above selector ^
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            response = client.audio.speech.with_streaming_response.create(
                model="cartesia/sonic-2",
                input="The quick brown fox jumps over the lazy dog.",
                voice="laidback woman",
            )

            with response as stream:
                stream.stream_to_file("audio.wav")
        # Python SDK v1 sample: calls audio.speech.create() directly and writes
        # the returned response to audio.wav.
        - lang: Python
          label: Together AI SDK (v1)
          source: |
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            response = client.audio.speech.create(
                model="cartesia/sonic-2",
                input="The quick brown fox jumps over the lazy dog.",
                voice="laidback woman",
            )

            response.stream_to_file("audio.wav")
        # TypeScript SDK sample: requests speech and, when the response has a
        # body, pipes it into audio.wav in the current working directory.
        - lang: TypeScript
          label: Together AI SDK (TypeScript)
          source: |
            import Together from "together-ai";
            import { createWriteStream } from "fs";
            import { join } from "path";
            import { pipeline } from "stream/promises";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const response = await client.audio.speech.create({
              model: "cartesia/sonic-2",
              input: "The quick brown fox jumps over the lazy dog.",
              voice: "laidback woman",
            });

            const filepath = join(process.cwd(), "audio.wav");
            const writeStream = createWriteStream(filepath);

            if (response.body) {
              await pipeline(response.body, writeStream);
            }
        # JavaScript SDK sample: same flow as the TypeScript snippet — request
        # speech, then pipe the response body (if any) into audio.wav.
        - lang: JavaScript
          label: Together AI SDK (JavaScript)
          source: |
            import Together from "together-ai";
            import { createWriteStream } from "fs";
            import { join } from "path";
            import { pipeline } from "stream/promises";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const response = await client.audio.speech.create({
              model: "cartesia/sonic-2",
              input: "The quick brown fox jumps over the lazy dog.",
              voice: "laidback woman",
            });

            const filepath = join(process.cwd(), "audio.wav");
            const writeStream = createWriteStream(filepath);

            if (response.body) {
              await pipeline(response.body, writeStream);
            }
        # cURL sample: POSTs the JSON payload with a bearer token read from
        # $TOGETHER_API_KEY and saves the response to audio.wav.
        - lang: Shell
          label: cURL
          source: |
            curl -X POST "https://api.together.ai/v1/audio/speech" \
                 -H "Authorization: Bearer $TOGETHER_API_KEY" \
                 -H "Content-Type: application/json" \
                 -d '{
                   "model": "cartesia/sonic-2",
                   "input": "The quick brown fox jumps over the lazy dog.",
                   "voice": "laidback woman"
                 }' \
                 --output audio.wav
components:
  schemas:
    # Request payload for POST /audio/speech. `model`, `input` and `voice` are
    # mandatory; every other property is optional.
    AudioSpeechRequest:
      type: object
      required:
        - model
        - input
        - voice
      properties:
        model:
          # Literal block scalar so the Markdown bullet list keeps its line
          # breaks instead of being folded into a single line.
          description: |
            The name of the model to query.<br> <br> [See all of Together AI's audio models](https://docs.together.ai/docs/serverless-models#audio-models)

            The currently supported TTS models are:

            - cartesia/sonic
            - hexgrad/Kokoro-82M
            - canopylabs/orpheus-3b-0.1-ft
          example: canopylabs/orpheus-3b-0.1-ft
          # Open enum: the first branch lists the known model names, the second
          # accepts any other string.
          anyOf:
            - type: string
              enum:
                - cartesia/sonic
                - hexgrad/Kokoro-82M
                - canopylabs/orpheus-3b-0.1-ft
            - type: string
        input:
          type: string
          description: Input text to generate the audio for
        voice:
          type: string
          description: >
            The voice to use for generating the audio. The supported voices are
            different for each model. For example, one of the voices supported
            by canopylabs/orpheus-3b-0.1-ft is tara, one of the voices supported
            by hexgrad/Kokoro-82M is af_alloy, and one of the voices supported
            by cartesia/sonic is "friendly sidekick". <br> <br> You can view the
            voices supported for each model by calling the /v1/voices endpoint
            with the model name as the query parameter. [View all supported
            voices
            here](https://docs.together.ai/docs/text-to-speech#supported-voices).
            <br> <br> `hexgrad/Kokoro-82M` additionally supports voice mixing,
            where two or more voices are combined into a single blended voice by
            joining their names with `+` (e.g. `af_bella+af_heart`). Optional
            per-voice weights can be provided in parentheses (e.g.
            `af_bella(2)+af_heart(1)`). Other models require a single voice
            name.
        response_format:
          type: string
          description: >-
            The format of audio output. Supported formats are mp3, wav, raw if
            streaming is false. If streaming is true, the only supported format
            is raw.
          default: wav
          enum:
            - mp3
            - wav
            - raw
        language:
          type: string
          description: >
            Language or locale of input text. Accepts ISO 639-1 language codes
            (e.g., `en`, `fr`, `es`, `zh`) as well as locale codes for
            region-specific variants. Locale codes must be lowercase (e.g.,
            `zh-hk` for Cantonese).
          default: en
          example: en
        response_encoding:
          type: string
          # NOTE(review): the description mentions a `pcm` response_format, but
          # the response_format enum in this schema only lists mp3, wav and raw
          # — confirm which is correct.
          description: >-
            Audio encoding of response. Only applicable when response_format is
            raw or pcm. Cartesia models respect this parameter and support all
            values. Orpheus, Kokoro, and Minimax models always return pcm_s16le
            regardless of this setting.
          default: pcm_f32le
          enum:
            - pcm_f32le
            - pcm_s16le
            - pcm_mulaw
            - pcm_alaw
        sample_rate:
          type: integer
          description: >-
            Sampling rate in Hz for the output audio. Cartesia and Minimax
            models respect this parameter. Orpheus and Kokoro models always
            output at 24000 Hz regardless of this setting.
          default: 44100
        bit_rate:
          type: integer
          description: >-
            Bitrate of the MP3 audio output in bits per second. Only applicable
            when response_format is mp3. Higher values produce better audio
            quality at larger file sizes. Default is 128000. Currently supported
            on Cartesia models.
          default: 128000
          enum:
            - 32000
            - 64000
            - 96000
            - 128000
            - 192000
        stream:
          type: boolean
          description: >-
            If true, output is streamed for several characters at a time instead
            of waiting for the full response. The stream terminates with `data:
            [DONE]`. If false, the encoded audio is returned as an octet stream.
          default: false
        extra_params:
          type: object
          description: >-
            Additional model-specific parameters that fine-tune speech
            generation behavior.
          properties:
            pronunciation_dict:
              type: array
              items:
                type: string
              description: >-
                A list of pronunciation rules for specific characters or
                symbols. Each entry uses the format `"<source>/<replacement>"`
                (e.g., `["omg/oh my god"]`) to override how the model pronounces
                matching tokens.
              example:
                - omg/oh my god
    # One streamed message: exactly one of an audio chunk event or the
    # terminating sentinel.
    AudioSpeechStreamResponse:
      oneOf:
        - $ref: '#/components/schemas/AudioSpeechStreamEvent'
        - $ref: '#/components/schemas/StreamSentinel'
    # Error payload: a single `error` object whose `type` and `message` are
    # always present strings, while `param` and `code` may be null.
    ErrorData:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          properties:
            message:
              type: string
            type:
              type: string
            # OpenAPI 3.1 has no `nullable` keyword; nullability is expressed by
            # listing 'null' among the types (quoted so YAML reads a string).
            param:
              type:
                - string
                - 'null'
              default: null
            code:
              type:
                - string
                - 'null'
              default: null
          required:
            - type
            - message
    # Streamed event wrapper: the mandatory `data` field holds one audio chunk.
    AudioSpeechStreamEvent:
      type: object
      required:
        - data
      properties:
        data:
          $ref: '#/components/schemas/AudioSpeechStreamChunk'
    # Sentinel message: `data` can only be the literal string '[DONE]'.
    StreamSentinel:
      type: object
      required:
        - data
      properties:
        data:
          title: stream_signal
          type: string
          enum:
            # Quoted so YAML reads a string rather than a flow sequence.
            - '[DONE]'
    # One chunk of streamed audio: a fixed object tag, the model name, and the
    # base64-encoded audio payload. All three fields are mandatory.
    AudioSpeechStreamChunk:
      type: object
      required:
        - object
        - model
        - b64
      properties:
        object:
          # Explicit type added for consistency with the sibling properties;
          # `const` still pins the only permitted value.
          type: string
          description: The object type, which is always `audio.tts.chunk`.
          const: audio.tts.chunk
        model:
          type: string
          example: cartesia/sonic
        b64:
          type: string
          description: base64 encoded audio stream
  # Security schemes available to operations. `bearerAuth` is HTTP bearer
  # authentication; the `x-` keys are vendor extensions whose consumers are not
  # visible in this document.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      x-bearer-format: bearer
      x-default: default

````