> ## Documentation Index
> Fetch the complete documentation index at: https://docs.together.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create audio transcription request

> Transcribes audio into text


## OpenAPI

````yaml POST /audio/transcriptions
# Document-level metadata, servers and default auth for the Together REST API.
openapi: 3.1.0
info:
  title: Together APIs
  version: 2.0.0
  description: >-
    The Together REST API. See https://docs.together.ai for more details.
  termsOfService: 'https://www.together.ai/terms-of-service'
  contact:
    name: Together Support
    url: 'https://www.together.ai/contact'
  license:
    name: MIT
    url: 'https://github.com/togethercomputer/openapi/blob/main/LICENSE'
# Base URLs the operations can be called against, in declaration order.
servers:
  - description: Default environment for APIs
    url: 'https://api.together.ai/v1'
  - description: Optimized environment for inference
    url: 'https://api-inference.together.ai/v2'
# Default security requirement: bearer auth applies to operations that do not
# declare their own `security`.
security:
  - bearerAuth: []
paths:
  # Transcription endpoint: multipart form in, JSON transcript out.
  /audio/transcriptions:
    post:
      operationId: audio-transcriptions
      summary: Create audio transcription request
      description: Transcribes audio into text
      tags: [Audio]
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/AudioTranscriptionRequest'
      responses:
        # Success: the transcript payload.
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/AudioTranscriptionResponse'
        # Validation and decoding failures, reported with the JSON error
        # envelope.
        '400':
          description: >-
            BadRequest. Possible error codes include `audio_too_long`
            (audio duration exceeds the 4 hour cap), `file_too_large`
            (URL-fetched audio exceeds the 1 GB server-side cap),
            `unsupported_format` (codec or container could not be decoded),
            and `invalid_params` (request parameters failed validation).
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
        '401':
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
        # Unlike the other errors, this one is declared as text/html rather
        # than the JSON error envelope.
        '413':
          description: >-
            Payload Too Large. The request body exceeded the 500 MB edge
            limit. For larger payloads, host the file and submit an HTTPS
            URL via the `file` field (URL-fetched audio is capped at 1 GB
            server-side).
          content:
            text/html:
              schema:
                type: string
        '429':
          description: RateLimit
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
      # Client code samples for this operation (vendor extension).
      x-codeSamples:
        # Python SDK v2. The sample imports `os` because it reads the API key
        # from the environment, and uses a context manager so the audio file
        # handle is closed after the upload.
        - lang: Python
          label: Together AI SDK (v2)
          source: |
            # Docs for v1 can be found by changing the above selector ^
            import os

            from together import Together

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            with open("audio.wav", "rb") as file:
                response = client.audio.transcriptions.create(
                    model="openai/whisper-large-v3",
                    file=file,
                )

            print(response.text)
        # Python SDK v1. The sample imports `os` because it reads the API key
        # from the environment, and uses a context manager so the audio file
        # handle is closed after the upload.
        - lang: Python
          label: Together AI SDK (v1)
          source: |
            import os

            from together import Together

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            with open("audio.wav", "rb") as file:
                response = client.audio.transcriptions.create(
                    model="openai/whisper-large-v3",
                    file=file,
                )

            print(response.text)
        # TypeScript SDK sample. Stored as a literal block scalar so the YAML
        # text reads exactly like the rendered snippet.
        - lang: TypeScript
          label: Together AI SDK (TypeScript)
          source: |
            import Together from "together-ai";
            import { readFileSync } from "fs";
            import { join } from "path";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const audioFilePath = join(process.cwd(), "audio.wav");
            const audioBuffer = readFileSync(audioFilePath);
            const audioFile = new File([audioBuffer], "audio.wav", { type: "audio/wav" });

            const response = await client.audio.transcriptions.create({
              model: "openai/whisper-large-v3",
              file: audioFile,
            });

            console.log(response.text);
        # JavaScript SDK sample. Stored as a literal block scalar so the YAML
        # text reads exactly like the rendered snippet.
        - lang: JavaScript
          label: Together AI SDK (JavaScript)
          source: |
            import Together from "together-ai";
            import { readFileSync } from "fs";
            import { join } from "path";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const audioFilePath = join(process.cwd(), "audio.wav");
            const audioBuffer = readFileSync(audioFilePath);
            const audioFile = new File([audioBuffer], "audio.wav", { type: "audio/wav" });

            const response = await client.audio.transcriptions.create({
              model: "openai/whisper-large-v3",
              file: audioFile,
            });

            console.log(response.text);
        # Raw HTTP sample: posts a local file and the model name as multipart
        # form fields, authenticating with the TOGETHER_API_KEY environment
        # variable as a bearer token.
        - lang: Shell
          label: cURL
          source: |
            curl -X POST "https://api.together.ai/v1/audio/transcriptions" \
                 -H "Authorization: Bearer $TOGETHER_API_KEY" \
                 -F "file=@audio.wav" \
                 -F "model=openai/whisper-large-v3"
components:
  schemas:
    # Form fields of a transcription request; only `file` is required.
    AudioTranscriptionRequest:
      type: object
      required:
        - file
      properties:
        # The audio itself: either an uploaded binary part or a URL string.
        file:
          oneOf:
            - $ref: '#/components/schemas/AudioFileBinary'
            - $ref: '#/components/schemas/AudioFileUrl'
          description: >-
            Audio file upload or public HTTP/HTTPS URL. Supported formats: .wav,
            .mp3, .m4a, .webm, .flac, .ogg, .opus, .aac. Maximum duration 4
            hours; longer audio is rejected with `audio_too_long`. Binary
            uploads are additionally capped at 500 MB (HTTP 413); URL-fetched
            audio is capped at 1 GB.
        model:
          type: string
          description: Model to use for transcription
          default: openai/whisper-large-v3
          # NOTE(review): the `prompt` description below names
          # `nvidia/parakeet-tdt-0.6b-v3` as another STT model, but this enum
          # does not allow it. Confirm whether it should be listed here.
          enum:
            - openai/whisper-large-v3
        language:
          type: string
          description: >-
            Optional ISO 639-1 language code. If `auto` is provided, language is
            auto-detected.
          default: en
          example: en
        prompt:
          type: string
          description: >-
            Optional text to bias decoding. Supported only on Whisper-family
            models (e.g. `openai/whisper-large-v3`). Other STT models (e.g.
            `nvidia/parakeet-tdt-0.6b-v3`) accept the field for API
            compatibility but ignore it.
        response_format:
          type: string
          description: The format of the response
          default: json
          enum:
            - json
            - verbose_json
        temperature:
          type: number
          description: Sampling temperature between 0.0 and 1.0
          default: 0
          minimum: 0
          maximum: 1
        # Accepts one granularity as a string, or a list of one or two
        # distinct granularities.
        timestamp_granularities:
          oneOf:
            - type: string
              enum:
                - segment
                - word
            - type: array
              items:
                type: string
                enum:
                  - segment
                  - word
              uniqueItems: true
              minItems: 1
              maxItems: 2
          description: >-
            Controls level of timestamp detail in verbose_json. Only used when
            response_format is verbose_json. Can be a single granularity or an
            array to get multiple levels.
          default: segment
          example:
            - word
            - segment
        diarize:
          type: boolean
          # Literal block scalar so the example keeps its line breaks and
          # indentation. The example is a well-formed fragment: each segment
          # is an object inside the `speaker_segments` array.
          description: |
            Whether to enable speaker diarization. When enabled, each entry in
            the `words` array of the response carries the speaker id for that
            word. The response also includes a `speaker_segments` array that
            contains, for each speaker segment, the speaker id, the start and
            end time of the segment, and all the words in the segment.

            For example:

            ```
            "speaker_segments": [
              {
                "speaker_id": "SPEAKER_00",
                "start": 0,
                "end": 30.02,
                "words": [
                  {
                    "id": 0,
                    "word": "Tijana",
                    "start": 0,
                    "end": 11.475,
                    "speaker_id": "SPEAKER_00"
                  },
                  ...
                ]
              },
              ...
            ]
            ```
          default: false
        min_speakers:
          type: integer
          description: >-
            Minimum number of speakers expected in the audio. Used to improve
            diarization accuracy when the approximate number of speakers is
            known.
        max_speakers:
          type: integer
          description: >-
            Maximum number of speakers expected in the audio. Used to improve
            diarization accuracy when the approximate number of speakers is
            known.
    # Successful transcription payload: the plain or the verbose shape
    # (presumably selected by the request's `response_format`; confirm).
    AudioTranscriptionResponse:
      # anyOf, not oneOf: the plain schema only requires `text`, so a verbose
      # payload satisfies both branches and would fail an exactly-one check.
      anyOf:
        - $ref: '#/components/schemas/AudioTranscriptionJsonResponse'
        - $ref: '#/components/schemas/AudioTranscriptionVerboseJsonResponse'
    # JSON error envelope: a single required `error` object.
    ErrorData:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          required:
            - type
            - message
          properties:
            message:
              type: string
            type:
              type: string
            # OpenAPI 3.1 has no `nullable` keyword; nullability is expressed
            # by including 'null' in the `type` list.
            param:
              type:
                - string
                - 'null'
              default: null
            code:
              type:
                - string
                - 'null'
              default: null
    # `file` variant: the audio bytes sent directly in the request.
    AudioFileBinary:
      description: >-
        Audio file uploaded as binary. Maximum upload size 500 MB
        (requests above this are rejected at the edge with HTTP 413).
        For larger payloads, host the file and submit a URL instead.
      type: string
      format: binary
    # `file` variant: a URL string the server downloads the audio from.
    AudioFileUrl:
      description: >-
        Public HTTPS URL the server fetches the audio from. Server-side
        fetch limit is 1 GB; downloads exceeding the cap fail with
        `file_too_large`.
      type: string
      format: uri
    # Minimal response shape: just the transcript text.
    AudioTranscriptionJsonResponse:
      type: object
      required: [text]
      properties:
        text:
          description: The transcribed text
          type: string
          example: Hello, world!
    # Detailed response shape: transcript plus language, duration and
    # segments; `words` and `speaker_segments` are optional.
    AudioTranscriptionVerboseJsonResponse:
      type: object
      required: [language, duration, text, segments]
      properties:
        language:
          description: The language of the audio
          type: string
          example: english
        duration:
          description: The duration of the audio in seconds
          type: number
          example: 3.5
        text:
          description: The transcribed text
          type: string
          example: Hello, world!
        segments:
          description: Array of transcription segments
          type: array
          items:
            $ref: '#/components/schemas/AudioTranscriptionSegment'
        words:
          description: >-
            Array of transcription words (only when
            timestamp_granularities includes 'word')
          type: array
          items:
            $ref: '#/components/schemas/AudioTranscriptionWord'
        speaker_segments:
          description: >-
            Array of transcription speaker segments (only when diarize
            is enabled)
          type: array
          items:
            $ref: '#/components/schemas/AudioTranscriptionSpeakerSegment'
    # One timed span of transcript text; all four fields are required.
    AudioTranscriptionSegment:
      type: object
      required: [id, start, end, text]
      properties:
        id:
          description: Unique identifier for the segment
          type: integer
          example: 0
        start:
          description: Start time of the segment in seconds
          type: number
          example: 0
        end:
          description: End time of the segment in seconds
          type: number
          example: 3.5
        text:
          description: The text content of the segment
          type: string
          example: Hello, world!
    # One timed word; `speaker_id` is optional.
    # NOTE(review): the diarization example in the request schema shows an
    # `id` field on word entries that is not declared here. Confirm whether
    # it should be added.
    AudioTranscriptionWord:
      type: object
      required: [word, start, end]
      properties:
        word:
          description: The word
          type: string
          example: Hello
        start:
          description: Start time of the word in seconds
          type: number
          example: 0
        end:
          description: End time of the word in seconds
          type: number
          example: 0.5
        speaker_id:
          description: The speaker id for the word (only when diarize is enabled)
          type: string
          example: SPEAKER_00
    # A timed span attributed to one speaker, with its words and full text;
    # every field is required.
    AudioTranscriptionSpeakerSegment:
      type: object
      required: [speaker_id, start, end, words, text, id]
      properties:
        speaker_id:
          description: The speaker identifier
          type: string
          example: SPEAKER_00
        start:
          description: Start time of the speaker segment in seconds
          type: number
          example: 0
        end:
          description: End time of the speaker segment in seconds
          type: number
          example: 30.02
        words:
          description: Array of words spoken by this speaker in this segment
          type: array
          items:
            $ref: '#/components/schemas/AudioTranscriptionWord'
        text:
          description: The full text spoken by this speaker in this segment
          type: string
          example: Hello, how are you doing today?
        id:
          description: Unique identifier for the speaker segment
          type: integer
          example: 1
  # Authentication schemes that `security` requirements can reference.
  securitySchemes:
    # HTTP authentication using the bearer scheme.
    bearerAuth:
      scheme: bearer
      type: http
      # Vendor extensions; their consumers are not visible in this document.
      x-bearer-format: bearer
      x-default: default

````