> ## Documentation Index
> Fetch the complete documentation index at: https://docs.together.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create an evaluation job


## OpenAPI

````yaml POST /evaluation
# OpenAPI document version followed by API-level metadata.
openapi: '3.1.0'
info:
  title: Together APIs
  version: '2.0.0'
  description: >-
    The Together REST API. See https://docs.together.ai for more
    details.
  termsOfService: https://www.together.ai/terms-of-service
  # Support contact for the API.
  contact:
    url: https://www.together.ai/contact
    name: Together Support
  # License information for the API.
  license:
    url: https://github.com/togethercomputer/openapi/blob/main/LICENSE
    name: MIT
# Base URLs for the API; the list order is kept as published.
servers:
  - description: Default environment for APIs
    url: https://api.together.ai/v1
  - description: Optimized environment for inference
    url: https://api-inference.together.ai/v2
# Document-wide security requirement: the bearerAuth scheme declared under
# components.securitySchemes, with no scopes.
security:
  - bearerAuth: []
paths:
  /evaluation:
    # POST /evaluation: takes an EvaluationTypedRequest JSON body and answers
    # with EvaluationResponse (200) or ErrorData (400, 500).
    post:
      operationId: createEvaluationJob
      summary: Create an evaluation job
      tags: [evaluation]
      requestBody:
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/EvaluationTypedRequest"
        required: true
      responses:
        "200":
          description: Evaluation job created successfully
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/EvaluationResponse"
        "400":
          description: Invalid request format
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorData"
        "500":
          description: Failed to create evaluation job
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorData"
      # Per-language client snippets for this operation.
      x-codeSamples:
        # Python (v2 SDK) sample. The nested request parameters are written as
        # plain dict literals so the snippet runs as shown: it only uses names
        # it imports itself (Together, os).
        - lang: Python
          label: Together AI SDK (v2)
          source: |
            # Docs for v1 can be found by changing the above selector ^
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            response = client.evals.create(
                type="classify",
                parameters={
                    "judge": {
                        "model": "openai/gpt-oss-120b",
                        "model_source": "serverless",
                        "system_template": "You are an expert evaluator...",
                    },
                    "input_data_file_path": "file-abc123",
                    "labels": ["good", "bad"],
                    "pass_labels": ["good"],
                    "model_to_evaluate": "Qwen/Qwen3.5-9B",
                },
            )

            print(response.workflow_id)
        # Python (v1 SDK) sample: the same classify request made through
        # client.evaluation.create with flat keyword arguments
        # (judge_model_name, judge_system_template, ...) instead of a nested
        # `parameters` object.
        - lang: Python
          label: Together AI SDK (v1)
          source: |
            from together import Together
            import os

            client = Together(
                api_key=os.environ.get("TOGETHER_API_KEY"),
            )

            response = client.evaluation.create(
                type="classify",
                judge_model_name="openai/gpt-oss-120b",
                judge_system_template="You are an expert evaluator...",
                input_data_file_path="file-abc123",
                labels=["good", "bad"],
                pass_labels=["good"],
                model_to_evaluate="Qwen/Qwen3.5-9B"
            )

            print(response.workflow_id)
        # TypeScript sample: client.evals.create is called with `type` plus a
        # nested `parameters` object whose field names match the
        # EvaluationClassifyParameters schema in this document.
        - lang: TypeScript
          label: Together AI SDK (TypeScript)
          source: |
            import Together from "together-ai";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const response = await client.evals.create({
              type: 'classify',
              parameters: {
                judge: {
                  model: 'openai/gpt-oss-120b',
                  model_source: 'serverless',
                  system_template: 'You are an expert evaluator...',
                },
                input_data_file_path: 'file-abc123',
                labels: ['good', 'bad'],
                pass_labels: ['good'],
                model_to_evaluate: 'Qwen/Qwen3.5-9B',
              },
            });

            console.log(response.workflow_id);
        # JavaScript sample: the snippet text is the same as the TypeScript
        # sample; only the `lang` and `label` values differ.
        - lang: JavaScript
          label: Together AI SDK (JavaScript)
          source: |
            import Together from "together-ai";

            const client = new Together({
              apiKey: process.env.TOGETHER_API_KEY,
            });

            const response = await client.evals.create({
              type: 'classify',
              parameters: {
                judge: {
                  model: 'openai/gpt-oss-120b',
                  model_source: 'serverless',
                  system_template: 'You are an expert evaluator...',
                },
                input_data_file_path: 'file-abc123',
                labels: ['good', 'bad'],
                pass_labels: ['good'],
                model_to_evaluate: 'Qwen/Qwen3.5-9B',
              },
            });

            console.log(response.workflow_id);
components:
  schemas:
    # Request body for POST /evaluation: the evaluation type plus the
    # parameter object for that type. Both fields are required.
    EvaluationTypedRequest:
      type: object
      required:
        - type
        - parameters
      properties:
        type:
          type: string
          enum:
            - classify
            - score
            - compare
          description: The type of evaluation to perform
          example: classify
        parameters:
          # anyOf rather than oneOf: the three parameter schemas overlap.
          # EvaluationCompareParameters only requires `judge` and
          # `input_data_file_path` and does not forbid extra properties, so
          # every valid classify or score object also validates against it.
          # oneOf demands exactly one match and would reject those payloads.
          anyOf:
            - $ref: '#/components/schemas/EvaluationClassifyParameters'
            - $ref: '#/components/schemas/EvaluationScoreParameters'
            - $ref: '#/components/schemas/EvaluationCompareParameters'
          description: Type-specific parameters for the evaluation
    # Body of the 200 response of POST /evaluation.
    EvaluationResponse:
      type: object
      properties:
        workflow_id:
          description: The ID of the created evaluation job
          type: string
          example: eval-1234-1244513
        status:
          description: Initial status of the job
          type: string
          enum: [pending]
    # Error envelope used by the 400 and 500 responses of POST /evaluation.
    # `error.type` and `error.message` are always present; `param` and `code`
    # may be null.
    ErrorData:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          required:
            - type
            - message
          properties:
            message:
              type: string
            type:
              type: string
            # This document declares OpenAPI 3.1.0, which has no `nullable`
            # keyword; a value that may be null is declared by listing 'null'
            # among the allowed types.
            param:
              type:
                - string
                - 'null'
              default: null
            code:
              type:
                - string
                - 'null'
              default: null
    # Parameters for a `classify` evaluation: judge configuration, the set of
    # possible labels, the labels that count as passing, and the data file.
    # `model_to_evaluate` is the only optional field.
    EvaluationClassifyParameters:
      type: object
      required:
        - judge
        - labels
        - pass_labels
        - input_data_file_path
      properties:
        judge:
          $ref: "#/components/schemas/EvaluationJudgeModelConfig"
        labels:
          description: List of possible classification labels
          type: array
          minItems: 2
          items:
            type: string
          example: ['yes', 'no']
        pass_labels:
          description: List of labels that are considered passing
          type: array
          minItems: 1
          items:
            type: string
          example: ['yes']
        model_to_evaluate:
          $ref: "#/components/schemas/EvaluationModelOrString"
        input_data_file_path:
          description: Data file ID
          type: string
          example: file-1234-aefd
    # Parameters for a `score` evaluation: judge configuration, the score
    # range, the passing threshold, and the data file. `model_to_evaluate` is
    # the only optional field.
    EvaluationScoreParameters:
      type: object
      required:
        - judge
        - min_score
        - max_score
        - pass_threshold
        - input_data_file_path
      properties:
        judge:
          $ref: "#/components/schemas/EvaluationJudgeModelConfig"
        min_score:
          description: Minimum possible score
          type: number
          example: 0
        max_score:
          description: Maximum possible score
          type: number
          example: 10
        pass_threshold:
          description: Score threshold for passing
          type: number
          example: 7
        model_to_evaluate:
          $ref: "#/components/schemas/EvaluationModelOrString"
        input_data_file_path:
          description: Data file ID
          type: string
          example: file-01234567890123456789
    # Parameters for a `compare` evaluation. Only `judge` and
    # `input_data_file_path` are required; `model_a` / `model_b` each accept
    # either a model configuration object or a dataset column name.
    EvaluationCompareParameters:
      type: object
      required:
        - judge
        - input_data_file_path
      properties:
        judge:
          $ref: "#/components/schemas/EvaluationJudgeModelConfig"
        model_a:
          oneOf:
            - $ref: "#/components/schemas/EvaluationModelRequest"
            - description: Column name in the input data containing pre-generated responses
              type: string
          description: >
            Either an EvaluationModelRequest for generation or a string column
            name from the dataset (when responses are pre-generated). When both
            model_a and model_b are EvaluationModelRequest objects, their
            inference runs execute in parallel to reduce total wall-clock time.
        model_b:
          oneOf:
            - $ref: "#/components/schemas/EvaluationModelRequest"
            - description: Column name in the input data containing pre-generated responses
              type: string
          description: >
            Either an EvaluationModelRequest for generation or a string column
            name from the dataset (when responses are pre-generated). When both
            model_a and model_b are EvaluationModelRequest objects, their
            inference runs execute in parallel to reduce total wall-clock time.
        input_data_file_path:
          description: Data file ID
          type: string
          example: file-01234567890123456789
        disable_position_bias_correction:
          description: >
            When false (default), the judge runs twice per sample: once with
            model A's response first (original order) and once with model B's
            response first (flipped order). The two verdicts are reconciled to
            cancel out position bias. When true, only the original-order pass is
            run, halving judge cost and latency at the expense of position-bias
            correction. The result file will not contain flipped-order judge
            fields when this is true.
          type: boolean
          default: false
    # Judge model configuration referenced by the classify, score, and compare
    # parameter schemas. `model`, `system_template`, and `model_source` are
    # required; the remaining fields are optional.
    EvaluationJudgeModelConfig:
      type: object
      required:
        - model
        - system_template
        - model_source
      properties:
        model:
          type: string
          description: Name of the judge model
          example: Qwen/Qwen3.5-9B
        system_template:
          type: string
          description: System prompt template for the judge
          example: Imagine you are a helpful assistant
        model_source:
          type: string
          # Literal block scalar (`|`) so every bullet keeps its own line. A
          # folded scalar joins same-indent lines with spaces, which merges
          # list items into the preceding sentence.
          description: |
            Source of the judge model inference:

            - `serverless`: Together's shared serverless inference API.
              Default concurrency: 25 workers.
            - `dedicated`: A Together dedicated deployment endpoint.
              Default concurrency: 5 workers (minimum enforced even if
              `num_workers` is set lower).
            - `external`: An external inference API (e.g. OpenAI, Anthropic,
              Google, OpenRouter). Requires `external_api_token` and
              `external_base_url`. Default concurrency: 2 workers for
              first-party APIs, 20 for proxy/aggregator endpoints.
          enum:
            - serverless
            - dedicated
            - external
        external_api_token:
          type: string
          description: >-
            Bearer/API token for the external judge model provider. Required
            when model_source is 'external'.
        external_base_url:
          type: string
          description: >-
            Base URL of the external inference API for the judge. Must be
            OpenAI-compatible. Required when model_source is 'external'.
        num_workers:
          type: integer
          minimum: 1
          description: >
            Number of concurrent inference workers for the judge. Overrides the
            source-specific default (serverless: 25, dedicated: 5, external:
            2–20). For dedicated endpoints the value is clamped to a minimum of
            5 regardless of what is set here.
          example: 5
        max_tokens:
          type: integer
          minimum: 1
          description: >-
            Maximum number of tokens the judge model may generate. Defaults to
            32768 if omitted. Set higher for reasoning judges (e.g. o-series,
            Gemini) that spend tokens on internal chain-of-thought before
            emitting the verdict JSON.
          example: 8192
        temperature:
          type: number
          minimum: 0
          maximum: 2
          description: >-
            Sampling temperature for the judge model. Defaults to 0.05 if
            omitted.
          example: 0
    # Union used by `model_to_evaluate`: a dataset column name (string) or an
    # EvaluationModelRequest object.
    EvaluationModelOrString:
      oneOf:
        - description: Column name in the input dataset containing pre-generated responses
          type: string
        - $ref: "#/components/schemas/EvaluationModelRequest"
    # Configuration of a model whose responses are generated during the
    # evaluation. The `external_*` fields and `num_workers` are optional; all
    # other fields are required.
    EvaluationModelRequest:
      type: object
      required:
        - model
        - max_tokens
        - temperature
        - system_template
        - input_template
        - model_source
      properties:
        model:
          type: string
          description: Name of the model to evaluate
          example: Qwen/Qwen3.5-9B
        max_tokens:
          type: integer
          minimum: 1
          description: Maximum number of tokens to generate.
          example: 512
        temperature:
          type: number
          minimum: 0
          maximum: 2
          description: Sampling temperature for generation.
          example: 0.7
        system_template:
          type: string
          description: >-
            System prompt template. Supports Jinja2 variables referencing
            dataset columns.
          example: You are a helpful assistant.
        input_template:
          type: string
          description: >-
            User message template. Supports Jinja2 variables referencing dataset
            columns.
          example: 'Please answer the following question: {{ question }}'
        model_source:
          type: string
          # Literal block scalar (`|`) so every bullet keeps its own line. A
          # folded scalar joins same-indent lines with spaces, which merges
          # list items into the preceding sentence.
          description: |
            Source of the model inference:

            - `serverless`: Together's shared serverless inference API.
              Default concurrency: 25 workers.
            - `dedicated`: A Together dedicated deployment endpoint.
              Default concurrency: 5 workers (minimum enforced even if
              `num_workers` is set lower). Authentication uses the
              requesting user's Together API token automatically.
            - `external`: An external inference API (e.g. OpenAI, Anthropic,
              Google, OpenRouter). Requires `external_api_token` and
              `external_base_url`. Default concurrency: 2 workers for
              first-party APIs (OpenAI, Anthropic, Google), 20 for
              proxy/aggregator endpoints.
          enum:
            - serverless
            - dedicated
            - external
        external_api_token:
          type: string
          description: >-
            Bearer/API token for the external model provider. Required when
            model_source is 'external'.
        external_base_url:
          type: string
          description: >-
            Base URL of the external inference API. Must be OpenAI-compatible.
            Required when model_source is 'external'.
        num_workers:
          type: integer
          minimum: 1
          description: >
            Number of concurrent inference workers. Overrides the
            source-specific default (serverless: 25, dedicated: 5, external:
            2–20). For dedicated endpoints the value is clamped to a minimum of
            5 regardless of what is set here.
          example: 5
  # Scheme behind the document-level `security` requirement: HTTP bearer
  # authentication.
  securitySchemes:
    bearerAuth:
      scheme: bearer
      type: http
      # NOTE(review): the standard OpenAPI field name is `bearerFormat`; the
      # two `x-` keys below are vendor extensions. Confirm they are intended.
      x-bearer-format: bearer
      x-default: default

````