Iterative Workflow

Iteratively call LLMs to optimize task performance.

The iterative workflow refines a solution until all task requirements are met. One LLM (the generator) performs the task, and a second LLM (the evaluator) checks whether the result satisfies every specified criterion. If it does not, the generator tries again using the evaluator's feedback, and the cycle repeats until the evaluator confirms all requirements are met.
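
Conceptually, the loop is just generate, evaluate, and retry with feedback. The sketch below shows only the control flow; generate and evaluate are placeholder stubs here and are replaced with real LLM calls in the sections that follow.

def generate(task: str, feedback: str = "") -> str:
    # Placeholder stub: in the full workflow this is an LLM call that incorporates the feedback
    return f"candidate solution for {task!r}"

def evaluate(task: str, solution: str) -> tuple[str, str]:
    # Placeholder stub: in the full workflow a second LLM returns a verdict and feedback
    return "PASS", ""

def refine_until_pass(task: str) -> str:
    """Generate, evaluate, and regenerate with feedback until the evaluator passes the result."""
    solution = generate(task)
    while True:
        verdict, feedback = evaluate(task, solution)
        if verdict == "PASS":
            return solution
        solution = generate(task, feedback)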

Workflow Architecture

Build an agent that iteratively improves responses.

Setup Client & Helper Functions

import json
from pydantic import ValidationError
from together import Together

client = Together()

def run_llm(user_prompt: str, model: str, system_prompt: str = None):
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    
    messages.append({"role": "user", "content": user_prompt})
    
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.7,
        max_tokens=4000,        
    )

    return response.choices[0].message.content

def JSON_llm(user_prompt: str, schema, system_prompt: str = None):
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    messages.append({"role": "user", "content": user_prompt})

    extract = client.chat.completions.create(
        messages=messages,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        response_format={
            "type": "json_object",
            "schema": schema.model_json_schema(),
        },
    )

    try:
        # Validate the raw output against the provided Pydantic schema before returning it as a dict
        return schema.model_validate(json.loads(extract.choices[0].message.content)).model_dump()
    except (json.JSONDecodeError, ValidationError) as e:
        print(f"Failed to parse JSON: {e}")
        return None
import assert from "node:assert";
import Together from "together-ai";
import { Schema } from "zod";
import zodToJsonSchema from "zod-to-json-schema";

const client = new Together();

export async function runLLM(userPrompt: string, model: string) {
  const response = await client.chat.completions.create({
    model,
    messages: [{ role: "user", content: userPrompt }],
    temperature: 0.7,
    max_tokens: 4000,
  });

  const content = response.choices[0].message?.content;
  assert(typeof content === "string");
  return content;
}

export async function jsonLLM<T>(
  userPrompt: string,
  schema: Schema<T>,
  systemPrompt?: string,
) {
  const messages: { role: "system" | "user"; content: string }[] = [];
  if (systemPrompt) {
    messages.push({ role: "system", content: systemPrompt });
  }

  messages.push({ role: "user", content: userPrompt });

  const response = await client.chat.completions.create({
    model: "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages,
    response_format: {
      type: "json_object",
      // @ts-expect-error: the generated JSON schema is not covered by the SDK's response_format types
      schema: zodToJsonSchema(schema, {
        target: "openAi",
      }),
    },
  });

  const content = response.choices[0].message?.content;
  assert(typeof content === "string");

  return schema.parse(JSON.parse(content));
}
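
As a quick sanity check, the helpers can be exercised on their own. The snippet below is a minimal Python sketch that assumes TOGETHER_API_KEY is set in the environment; the Sentiment model is purely illustrative and is not part of the workflow.

from pydantic import BaseModel

class Sentiment(BaseModel):
    label: str
    confidence: float

# Plain-text completion
print(run_llm(
    "Summarize the benefits of unit tests in one sentence.",
    model="Qwen/Qwen2.5-Coder-32B-Instruct",
))

# JSON completion validated against the Sentiment schema
print(JSON_llm("Classify the sentiment of: 'I love this library!'", Sentiment))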

Implement Workflow

from pydantic import BaseModel
from typing import Literal

GENERATOR_PROMPT = """
Your goal is to complete the task based on <user input>. If there is feedback
from your previous generations, you should reflect on it to improve your solution.

Output your answer concisely in the following format: 

Thoughts:
[Your understanding of the task and feedback and how you plan to improve]

Response:
[Your code implementation here]
"""

def generate(task: str, generator_prompt: str, context: str = "") -> str:
    """Generate and improve a solution based on feedback."""
    full_prompt = f"{generator_prompt}\n{context}\nTask: {task}" if context else f"{generator_prompt}\nTask: {task}"

    response = run_llm(full_prompt, model="Qwen/Qwen2.5-Coder-32B-Instruct")
    
    print("\n## Generation start")
    print(f"Output:\n{response}\n")
    
    return response

EVALUATOR_PROMPT = """
Evaluate this following code implementation for:
1. code correctness
2. time complexity
3. style and best practices

You should be evaluating only and not attempting to solve the task.

Only output "PASS" if all criteria are met and you have no further suggestions for improvements.

Provide detailed feedback if there are areas that need improvement. You should specify what needs improvement and why.

Only output JSON.
"""

def evaluate(task: str, evaluator_prompt: str, generated_content: str) -> tuple[str, str]:
    """Evaluate if a solution meets requirements."""
    full_prompt = f"{evaluator_prompt}\nOriginal task: {task}\nContent to evaluate: {generated_content}"

    # Build a schema for the evaluation
    class Evaluation(BaseModel):
        evaluation: Literal["PASS", "NEEDS_IMPROVEMENT", "FAIL"]
        feedback: str

    response = JSON_llm(full_prompt, Evaluation)
    
    evaluation = response["evaluation"]
    feedback = response["feedback"]

    print("## Evaluation start")
    print(f"Status: {evaluation}")
    print(f"Feedback: {feedback}")

    return evaluation, feedback

def loop_workflow(task: str, evaluator_prompt: str, generator_prompt: str) -> str:
    """Keep generating and evaluating until the evaluator passes the last generated response."""
    # Store previous responses from generator
    memory = []
    
    # Generate initial response
    response = generate(task, generator_prompt)
    memory.append(response)

    # While the generated response is not passing, keep generating and evaluating
    while True:
        evaluation, feedback = evaluate(task, evaluator_prompt, response)
        # Terminating condition
        if evaluation == "PASS":
            return response
        
        # Add current response and feedback to context and generate a new response
        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {feedback}"
        ])
        
        response = generate(task, generator_prompt, context)
        memory.append(response)
import dedent from "dedent";
import { z } from "zod";

const GENERATOR_PROMPT = dedent`
  Your goal is to complete the task based on <user input>. If there is feedback
  from your previous generations, you should reflect on it to improve your solution.

  Output your answer concisely in the following format: 

  Thoughts:
  [Your understanding of the task and feedback and how you plan to improve]

  Response:
  [Your code implementation here]
`;

/*
  Generate and improve a solution based on feedback.
*/
async function generate(task: string, generatorPrompt: string, context = "") {
  const fullPrompt = dedent`
    ${generatorPrompt}

    Task: ${task}

    ${context}
  `;

  const response = await runLLM(fullPrompt, "Qwen/Qwen2.5-Coder-32B-Instruct");
  console.log(dedent`
    ## Generation start

    ${response}
    \n
  `);

  return response;
}

const EVALUATOR_PROMPT = dedent`
  Evaluate this following code implementation for:

    1. code correctness
    2. time complexity
    3. style and best practices

  You should be evaluating only and not attempting to solve the task.

  Only output "PASS" if all criteria are met and you have no further suggestions for improvements.

  Provide detailed feedback if there are areas that need improvement. You should specify what needs improvement and why. Make sure to only use a single line without newlines for the feedback.

  Only output JSON.
`;

/*
  Evaluate if a solution meets the requirements.
*/
async function evaluate(
  task: string,
  evaluatorPrompt: string,
  generatedContent: string,
) {
  const fullPrompt = dedent`
    ${evaluatorPrompt}

    Original task: ${task}

    Content to evaluate: ${generatedContent}
  `;

  const schema = z.object({
    evaluation: z.enum(["PASS", "NEEDS_IMPROVEMENT", "FAIL"]),
    feedback: z.string(),
  });
  const { evaluation, feedback } = await jsonLLM(fullPrompt, schema);

  console.log(dedent`
    ## Evaluation start

    Status: ${evaluation}

    Feedback: ${feedback}
    \n
  `);

  return { evaluation, feedback };
}

/*
  Keep generating and evaluating until the evaluator passes the last generated response.
*/
async function loopWorkflow(
  task: string,
  evaluatorPrompt: string,
  generatorPrompt: string,
) {
  // Store previous responses from generator
  const memory = [];

  // Generate initial response
  let response = await generate(task, generatorPrompt);
  memory.push(response);

  while (true) {
    const { evaluation, feedback } = await evaluate(
      task,
      evaluatorPrompt,
      response,
    );

    if (evaluation === "PASS") {
      return response;
    }

    const context = dedent`
      Previous attempts:

      ${memory.map((m, i) => `### Attempt ${i + 1}\n\n${m}`).join("\n\n")}

      Feedback: ${feedback}
    `;

    response = await generate(task, generatorPrompt, context);
    memory.push(response);
  }
}
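
Both implementations loop until the evaluator returns PASS, so a persistently unsatisfied evaluator keeps the loop running indefinitely. A common safeguard, not part of the workflow above, is to cap the number of attempts; the variant below is a hypothetical Python sketch built on the generate and evaluate functions defined earlier.

def loop_workflow_bounded(task: str, evaluator_prompt: str, generator_prompt: str,
                          max_iterations: int = 5) -> str:
    """Same loop as loop_workflow, but gives up after max_iterations attempts."""
    memory = []
    response = generate(task, generator_prompt)
    memory.append(response)

    for _ in range(max_iterations):
        evaluation, feedback = evaluate(task, evaluator_prompt, response)
        if evaluation == "PASS":
            return response

        # Fold previous attempts and the latest feedback into the next generation
        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {feedback}",
        ])
        response = generate(task, generator_prompt, context)
        memory.append(response)

    # Give back the final best-effort attempt if the evaluator never passed it
    return response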

Example Usage

task = """
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
"""

loop_workflow(task, EVALUATOR_PROMPT, GENERATOR_PROMPT)
const task = dedent`
  Implement a Stack with:

    1. push(x)
    2. pop()
    3. getMin()

  All operations should be O(1).
`;

loopWorkflow(task, EVALUATOR_PROMPT, GENERATOR_PROMPT);

Use cases

  • Generating code that meets specific requirements, such as ensuring runtime complexity.
  • Searching for information and using an evaluator to verify that the results include all the required details.
  • Writing a story or article with specific tone or style requirements and using an evaluator to ensure the output matches the desired criteria, such as adhering to a particular voice or narrative structure.
  • Generating structured data from unstructured input and using an evaluator to verify that the data is properly formatted, complete, and consistent (see the sketch after this list).
  • Creating user interface text, like tooltips or error messages, and using an evaluator to confirm the text is concise, clear, and contextually appropriate.
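
For example, the structured-data use case above only requires different prompts. Everything below is an illustrative placeholder; loop_workflow is the Python function defined earlier.

EXTRACTION_GENERATOR_PROMPT = """
Your goal is to extract the requested fields from <user input> as JSON.
If there is feedback from your previous generations, reflect on it to improve your output.
"""

EXTRACTION_EVALUATOR_PROMPT = """
Evaluate the extracted JSON for completeness, correct formatting, and consistency with the source text.
Only output "PASS" if every required field is present and correct. Only output JSON.
"""

extraction_task = """
Extract name, email, and order_id from:
"Hi, I'm Jane Doe (jane@example.com). My order #A-1042 arrived damaged."
"""

loop_workflow(extraction_task, EXTRACTION_EVALUATOR_PROMPT, EXTRACTION_GENERATOR_PROMPT)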

📘

Iterative Workflow Cookbook

For a more detailed walk-through, refer to the notebook here.