> ## Documentation Index
> Fetch the complete documentation index at: https://docs.together.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create a dedicated endpoint

> Creates a new dedicated endpoint for serving models. The endpoint starts automatically after creation. You can deploy any supported model on hardware configurations that meet the model's requirements.



## OpenAPI

````yaml POST /endpoints
# OpenAPI document header: spec version plus API-level metadata.
openapi: '3.1.0'
info:
  title: 'Together APIs'
  description: >-
    The Together REST API. See https://docs.together.ai for more details.
  version: '2.0.0'
  termsOfService: 'https://www.together.ai/terms-of-service'
  contact:
    name: 'Together Support'
    url: 'https://www.together.ai/contact'
  license:
    name: 'MIT'
    url: 'https://github.com/togethercomputer/openapi/blob/main/LICENSE'
# Base URL that every path in this document is resolved against.
servers:
  - url: 'https://api.together.ai/v1'
# Default security requirement: the `bearerAuth` scheme, with no scopes.
security:
  - bearerAuth: []
paths:
  # POST /endpoints: create a dedicated endpoint.
  /endpoints:
    post:
      tags:
        - Endpoints
      summary: Create a dedicated endpoint
      description: >-
        Creates a new dedicated endpoint for serving models. The endpoint starts
        automatically after creation. You can deploy any supported model on
        hardware configurations that meet the model's requirements.
      operationId: createEndpoint
      # A JSON body matching CreateEndpointRequest is mandatory.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateEndpointRequest'
      responses:
        # Success: the body is the endpoint object that was just created.
        '200':
          description: The dedicated endpoint was created successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/DedicatedEndpoint'
        # Both error responses share the ErrorData envelope.
        '403':
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
        '500':
          description: Internal error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorData'
      x-codeSamples:
        # Python sample for the v2 SDK: replica bounds are passed in a nested
        # `autoscaling` dict. Everything under `source: |` is literal sample
        # text shown to readers.
        - lang: Python
          label: Together AI SDK (v2)
          source: |
            # Docs for v1 can be found by changing the above selector ^
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            endpoint = client.endpoints.create(
                model="Qwen/Qwen3.5-9B-FP8",
                hardware="1x_nvidia_a100_80gb_sxm",
                autoscaling={
                    "min_replicas": 2,
                    "max_replicas": 5,
                },
            )

            print(endpoint.id)
        # Python sample for the v1 SDK: unlike the v2 sample, replica bounds are
        # passed as top-level `min_replicas` / `max_replicas` keyword arguments.
        - lang: Python
          label: Together AI SDK (v1)
          source: |
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            endpoint = client.endpoints.create(
                model="Qwen/Qwen3.5-9B-FP8",
                hardware="1x_nvidia_a100_80gb_sxm",
                min_replicas=2,
                max_replicas=5,
            )

            print(endpoint.id)
        # TypeScript sample using the `together-ai` package; the API key is read
        # from the TOGETHER_API_KEY environment variable.
        - lang: TypeScript
          label: Together AI SDK (TypeScript)
          source: |
            import Together from "together-ai";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const endpoint = await client.endpoints.create({
              model: "Qwen/Qwen3.5-9B-FP8",
              hardware: "1x_nvidia_a100_80gb_sxm",
              autoscaling: {
                max_replicas: 5,
                min_replicas: 2,
              }
            });

            console.log(endpoint.id);
        # JavaScript sample; the source text is the same as the TypeScript
        # sample, published under a separate language label.
        - lang: JavaScript
          label: Together AI SDK (JavaScript)
          source: |
            import Together from "together-ai";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const endpoint = await client.endpoints.create({
              model: "Qwen/Qwen3.5-9B-FP8",
              hardware: "1x_nvidia_a100_80gb_sxm",
              autoscaling: {
                max_replicas: 5,
                min_replicas: 2,
              }
            });

            console.log(endpoint.id);
        # Raw HTTP sample: POSTs the JSON body to /v1/endpoints, sending
        # $TOGETHER_API_KEY as a bearer token in the Authorization header.
        - lang: Shell
          label: cURL
          source: |
            curl -X POST "https://api.together.ai/v1/endpoints" \
                 -H "Authorization: Bearer $TOGETHER_API_KEY" \
                 -H "Content-Type: application/json" \
                 -d '{
                   "model": "Qwen/Qwen3.5-9B-FP8",
                   "hardware": "1x_nvidia_a100_80gb_sxm",
                   "autoscaling": {
                     "max_replicas": 5,
                     "min_replicas": 2
                   }
                 }'
components:
  schemas:
    # Request body for POST /endpoints. `model`, `hardware` and `autoscaling`
    # are mandatory; every other property is optional.
    CreateEndpointRequest:
      type: object
      required:
        - model
        - hardware
        - autoscaling
      properties:
        display_name:
          type: string
          description: A human-readable name for the endpoint
          # Matches the DeepSeek-R1 `model` example below.
          example: My DeepSeek R1 endpoint
        model:
          type: string
          description: The model to deploy on this endpoint
          example: deepseek-ai/DeepSeek-R1
        hardware:
          type: string
          description: The hardware configuration to use for this endpoint
          example: 1x_nvidia_a100_80gb_sxm
        autoscaling:
          $ref: '#/components/schemas/Autoscaling'
          description: Configuration for automatic scaling of the endpoint
        disable_prompt_cache:
          deprecated: true
          type: boolean
          description: This parameter is deprecated and no longer has any effect.
          default: false
        disable_speculative_decoding:
          type: boolean
          description: Whether to disable speculative decoding for this endpoint
          default: false
        state:
          type: string
          description: The desired state of the endpoint
          enum:
            - STARTED
            - STOPPED
          default: STARTED
          example: STARTED
        inactive_timeout:
          # OpenAPI 3.1 removed the 3.0 `nullable` keyword; a nullable value is
          # declared by listing 'null' in the `type` array instead.
          type:
            - integer
            - 'null'
          description: >-
            The number of minutes of inactivity after which the endpoint stops
            automatically. Set to null, omit, or set to 0 to disable automatic
            timeout.
          example: 60
        availability_zone:
          type: string
          description: >-
            Create the endpoint in a specified availability zone (e.g.,
            us-central-4b)
    # Response schema for a single dedicated endpoint. Every property listed
    # under `properties` is also listed as required.
    DedicatedEndpoint:
      description: Details about a dedicated endpoint deployment
      type: object
      required:
        - object
        - id
        - name
        - display_name
        - model
        - hardware
        - type
        - owner
        - state
        - autoscaling
        - created_at
      properties:
        # Fixed discriminator value.
        object:
          const: endpoint
          description: The object type, which is always `endpoint`.
        id:
          description: Unique identifier for the endpoint
          type: string
          example: endpoint-d23901de-ef8f-44bf-b3e7-de9c1ca8f2d7
        name:
          description: System name for the endpoint
          type: string
          example: devuser/deepseek-ai/DeepSeek-R1-a32b82a1
        display_name:
          description: Human-readable name for the endpoint
          type: string
          example: My DeepSeek R1 endpoint
        model:
          description: The model deployed on this endpoint
          type: string
          example: deepseek-ai/DeepSeek-R1
        hardware:
          description: The hardware configuration used for this endpoint
          type: string
          example: 8x_nvidia_h200_140gb_sxm
        type:
          description: The type of endpoint
          type: string
          enum: [dedicated]
          example: dedicated
        owner:
          description: The owner of this endpoint
          type: string
          example: devuser
        # Lists more states than the two a create request may ask for.
        state:
          description: Current state of the endpoint
          type: string
          enum:
            - PENDING
            - STARTING
            - STARTED
            - STOPPING
            - STOPPED
            - ERROR
          example: STARTED
        autoscaling:
          description: Configuration for automatic scaling of the endpoint
          $ref: '#/components/schemas/Autoscaling'
        created_at:
          description: Timestamp when the endpoint was created
          type: string
          format: date-time
          example: '2025-02-04T10:43:55.405Z'
    # Error envelope used by the 403 and 500 responses of POST /endpoints.
    ErrorData:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          properties:
            # `message` and `type` are required, non-null strings. The 3.0-only
            # `nullable: false` markers were no-ops under OpenAPI 3.1 and are
            # dropped.
            message:
              type: string
            type:
              type: string
            # `param` and `code` may be null. OpenAPI 3.1 expresses that with a
            # 'null' entry in the `type` array, which also makes the
            # `default: null` value valid against the schema.
            param:
              type:
                - string
                - 'null'
              default: null
            code:
              type:
                - string
                - 'null'
              default: null
          required:
            - type
            - message
    # Replica bounds, referenced by both CreateEndpointRequest and
    # DedicatedEndpoint. Both bounds are required.
    Autoscaling:
      description: Configuration for automatic scaling of replicas based on demand.
      type: object
      required: [min_replicas, max_replicas]
      properties:
        min_replicas:
          description: >-
            The minimum number of replicas to maintain,
            even when there is no load
          type: integer
          format: int32
          examples: [2]
        max_replicas:
          description: The maximum number of replicas to scale up to under load
          type: integer
          format: int32
          examples: [5]
  # HTTP bearer authentication; this is the scheme named by the document's
  # top-level `security` requirement. The `x-` keys are specification
  # extensions.
  securitySchemes:
    bearerAuth:
      scheme: 'bearer'
      type: 'http'
      x-bearer-format: 'bearer'
      x-default: 'default'

````