Scripted Simulations
While automatic simulations are powerful, sometimes you need precise control over the conversation flow. Scripted simulations let you orchestrate exactly how conversations unfold, when evaluations occur, and what custom logic runs at each step.
Script Basics
A script is a list of functions that control the conversation flow:
result = await scenario.run(
    name="scripted interaction",
    description="Test specific conversation patterns",
    agents=[
        MyAgent(),
        scenario.UserSimulatorAgent(),
        scenario.JudgeAgent(criteria=["Agent responds helpfully"])
    ],
    script=[
        scenario.user("Hello, I need help"),
        scenario.agent(),
        scenario.judge(),
    ]
)
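Like automatic simulations, a scripted run returns a result object you can assert on in your test. A minimal pytest wrapper around the example above might look like the sketch below (assuming pytest-asyncio is installed, that MyAgent is the adapter from the example, and that the judge's verdict or an explicit succeed()/fail() sets result.success):

import pytest
import scenario

@pytest.mark.asyncio
async def test_scripted_interaction():
    result = await scenario.run(
        name="scripted interaction",
        description="Test specific conversation patterns",
        agents=[
            MyAgent(),
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent responds helpfully"]),
        ],
        script=[
            scenario.user("Hello, I need help"),
            scenario.agent(),
            scenario.judge(),
        ],
    )

    # The judge's verdict (or an explicit succeed/fail step) determines this flag
    assert result.success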
Commands List
| Command | Description |
|---|---|
| scenario.user() | Generate or add user messages |
| scenario.agent() | Generate or add agent messages |
| scenario.message() | Add any OpenAI-format message to the conversation |
| scenario.judge() | Force a judge evaluation at a specific point |
| scenario.proceed() | Let the conversation flow automatically for a number of turns |
| scenario.succeed() | End the scenario immediately as a success |
| scenario.fail() | End the scenario immediately as a failure |
Script Commands
scenario.user()
Generate or specify user messages:
script=[
    # Specific user message
    scenario.user("I want to cancel my subscription"),

    # Let user simulator generate message based on scenario
    scenario.user(),

    # Structured message with additional content
    scenario.message({
        "role": "user",
        "content": "What's in this image?",
        "attachments": [{"type": "image", "url": "..."}]
    })
]
scenario.agent()
Generate or specify agent responses:
script=[
    scenario.user("Help me with billing"),

    # Let agent generate response
    scenario.agent(),

    # Or specify exact response for testing
    scenario.agent("I'll help you with billing. Can you provide your account number?"),

    # Structured response with tool calls
    scenario.message({
        "role": "assistant",
        "content": "Let me look up your account",
        "tool_calls": [{"function": {"name": "lookup_account"}}]
    })
]
scenario.message()
Add any OpenAI-compatible message directly to the conversation:
script=[
    # User message
    scenario.message({"role": "user", "content": "Hello, I need help"}),

    # Assistant message
    scenario.message({"role": "assistant", "content": "I'd be happy to help!"}),

    # Assistant message with tool calls
    scenario.message({
        "role": "assistant",
        "content": "Let me look that up for you",
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "search_database",
                "arguments": '{"query": "user question"}'
            }
        }]
    }),

    # Tool response
    scenario.message({
        "role": "tool",
        "tool_call_id": "call_123",
        "content": '{"results": ["result1", "result2"]}'
    }),

    # System message (for context injection)
    scenario.message({
        "role": "system",
        "content": "The user is now in a hurry and needs quick responses"
    }),

    # Multimodal user message
    scenario.message({
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/..."}}
        ]
    })
]
scenario.judge()
Force judge evaluation at specific points:
script=[
    scenario.user("I need help"),
    scenario.agent(),

    # Force judge to evaluate now
    scenario.judge(),

    # If conversation continues...
    scenario.user(),
    scenario.agent(),
    scenario.judge()  # Final evaluation
]
scenario.proceed()
Let the conversation flow automatically for a specified number of turns:
script=[
    scenario.user("Start the conversation"),
    scenario.agent(),

    # Let it proceed for 3 turns
    scenario.proceed(turns=3),

    # Then take control again
    scenario.user("Final question"),
    scenario.agent(),
    scenario.succeed()
]
scenario.succeed() and scenario.fail()
End the scenario with a specific result:
script=[
    scenario.user("Test message"),
    scenario.agent(),

    # End with success
    scenario.succeed("Agent provided helpful response"),

    # Or end with failure
    scenario.fail("Agent did not meet requirements")
]
Custom Steps and Evaluations
Custom Assertion Functions
Add custom logic at any point in the conversation:
import json

def check_tool_usage(state: scenario.ScenarioState) -> None:
    """Verify agent called the required tool"""
    assert state.has_tool_call("get_weather"), "Agent should have called weather tool"

    # Get the tool call details
    weather_call = state.last_tool_call("get_weather")
    if weather_call:
        args = json.loads(weather_call["function"]["arguments"])
        assert "location" in args, "Weather tool should include location"

def verify_response_quality(state: scenario.ScenarioState) -> None:
    """Check response meets quality standards"""
    last_message = state.last_message()
    content = last_message.get("content", "")

    assert len(content) > 10, "Response should be substantial"
    assert "sorry" not in content.lower(), "Agent shouldn't apologize unnecessarily"

script=[
    scenario.user("What's the weather in Paris?"),
    scenario.agent(),
    check_tool_usage,         # Custom assertion
    verify_response_quality,  # Another custom check
    scenario.succeed()
]
Async Custom Steps
Custom steps can be async for external API calls:
async def external_evaluation(state: scenario.ScenarioState) -> None:
    """Run external evaluation service"""
    last_response = state.last_message().get("content", "")

    # Call external service
    eval_result = await external_evaluator.evaluate(
        input=state.last_user_message()["content"],
        output=last_response
    )

    assert eval_result.score > 0.8, f"Quality score too low: {eval_result.score}"

script=[
    scenario.user("Complex query"),
    scenario.agent(),
    external_evaluation,  # Async custom step
    scenario.proceed()
]
Conditional Logic
Use conditional logic to branch based on conversation state:
from typing import Optional

def conditional_check(state: scenario.ScenarioState) -> Optional[scenario.ScenarioResult]:
    """Branch logic based on agent response"""
    last_message = state.last_message()
    content = last_message.get("content", "").lower()

    if "i don't know" in content:
        return scenario.ScenarioResult(
            success=False,
            messages=state.messages,
            reasoning="Agent should not give up so easily"
        )
    elif "let me help" in content:
        # Continue the conversation
        return None
    else:
        # Force a specific follow-up
        state.add_message({
            "role": "user",
            "content": "Can you be more specific?"
        })
        return None

script=[
    scenario.user("I have a problem"),
    scenario.agent(),
    conditional_check,  # Returns ScenarioResult or None
    scenario.proceed()
]
Starting from Existing History
Sometimes you want to test scenarios that begin mid-conversation:
Pre-populate Conversation History
result = await scenario.run(
    name="mid-conversation booking",
    description="User is in the middle of booking a flight and now wants to add hotels",
    agents=[
        TravelAgent(),
        scenario.UserSimulatorAgent(),
        scenario.JudgeAgent(criteria=["Agent handles additional requests smoothly"])
    ],
    script=[
        # Set up previous conversation context
        scenario.message({"role": "user", "content": "I'd like to book a flight"}),
        scenario.message({"role": "assistant", "content": "I'd be happy to help. Where would you like to go?"}),
        scenario.message({"role": "user", "content": "I want to go to Paris"}),
        scenario.message({"role": "assistant", "content": "Great! When would you like to travel?"}),

        # Continue with new interaction
        scenario.user("Actually, can you also help me find hotels?"),
        scenario.agent(),
        scenario.judge()
    ]
)
Load from External Sources
If you have conversation history in OpenAI message format, for example from logs or a database, you can easily load it into the conversation:
async def load_real_conversation(state: scenario.ScenarioState) -> None:
    """Load conversation from logs or database"""
    conversation_data = await load_conversation_from_db("conversation_123")
    for message in conversation_data:
        state.add_message(message)

script=[
    load_real_conversation,               # Load real conversation
    scenario.user("One more question..."),  # Continue from there
    scenario.agent(),
    scenario.judge()
]
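load_conversation_from_db above is a hypothetical helper, not part of the library; any function that returns OpenAI-format message dicts will do. A stub that makes the expected shape explicit:

async def load_conversation_from_db(conversation_id: str) -> list[dict]:
    # Hypothetical stub: a real test would query your logs or database here.
    # The only requirement is that it returns OpenAI-format message dicts.
    return [
        {"role": "user", "content": "I'd like to upgrade my plan"},
        {"role": "assistant", "content": "Sure, which plan are you currently on?"},
    ]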
Advanced Flow Control
scenario.proceed() with Callbacks
Monitor and control automatic conversation flow:
def log_turn_progress(state: scenario.ScenarioState) -> None:
    """Log progress after each turn"""
    print(f"Turn {state.current_turn}: {len(state.messages)} messages")

    # Check for concerning patterns
    last_msg = state.last_message().get("content", "")
    if "error" in last_msg.lower():
        print("⚠️ Error detected in conversation")

def safety_check(state: scenario.ScenarioState) -> None:
    """Check each step for safety violations"""
    last_msg = state.last_message()
    if last_msg.get("role") == "assistant":
        content = last_msg.get("content", "").lower()
        if any(word in content for word in ["harmful", "dangerous", "illegal"]):
            raise AssertionError("Safety violation detected")

script=[
    scenario.user("Start conversation"),
    scenario.agent(),

    # Proceed with monitoring
    scenario.proceed(
        turns=5,
        on_turn=log_turn_progress,  # Called after each complete turn
        on_step=safety_check        # Called after each agent interaction
    ),
    scenario.judge()
]
Dynamic Script Modification
Modify the script based on conversation state:
import json

def dynamic_script_logic(state: scenario.ScenarioState) -> None:
    """Modify conversation flow based on agent behavior"""
    last_msg = state.last_message().get("content", "")

    if any(msg.get("tool_calls") for msg in state.messages):
        # Agent made a tool call, let's test error handling
        state.add_message({
            "role": "tool",
            "content": json.dumps({"error": "Service unavailable"}),
            "tool_call_id": "test_call_123"
        })

    # Continue based on agent's response to the error
    if "apologize" in last_msg.lower():
        # Agent apologized, test recovery
        state.add_message({
            "role": "user",
            "content": "Is there another way to help me?"
        })

script=[
    scenario.user("I need current data"),
    scenario.agent(),
    dynamic_script_logic,  # Inject tool error
    scenario.agent(),      # See how agent handles error
    scenario.judge()
]
Combining Scripted and Automatic Flow
Mix precise control with automatic simulation:
def verify_initial_response(state: scenario.ScenarioState) -> None:
    # assert is a statement, so it cannot live inside a lambda
    assert "help" in state.last_message().get("content", "").lower()

def verify_adaptation(state: scenario.ScenarioState) -> None:
    assert len(state.last_message().get("content", "")) > 50

script=[
    # Start with specific setup
    scenario.user("I need help with a complex issue"),
    scenario.agent(),

    # Verify initial response
    verify_initial_response,

    # Let conversation flow naturally for a while
    scenario.proceed(turns=3),

    # Inject a complication
    scenario.user("Actually, I need to change my original request"),
    scenario.agent(),

    # Verify adaptation
    verify_adaptation,

    # Let judge decide final outcome
    scenario.judge()
]
Best Practices
1. Start Simple
Begin with basic scripts and add complexity gradually:
# Start with basic flow
script=[
    scenario.user("Hello"),
    scenario.agent(),
    scenario.succeed()
]

# Then add validation
def greets_back(state: scenario.ScenarioState) -> None:
    assert "hello" in state.last_message().get("content", "").lower()

script=[
    scenario.user("Hello"),
    scenario.agent(),
    greets_back,
    scenario.succeed()
]
2. Use Descriptive Function Names
Make your custom steps self-documenting:
def ensure_no_pii_leaked(state: scenario.ScenarioState) -> None:
    content = state.last_message().get("content", "")
    assert "George" not in content, \
        "Agent should not mention the user's real name in responses"

def verify_agent_asks_for_clarification(state: scenario.ScenarioState) -> None:
    content = state.last_message().get("content", "").lower()
    assert any(word in content for word in ["what", "which", "how", "when", "where"]), \
        "Agent should ask clarifying questions"
3. Balance Control and Realism
Don't over-script; leave room for natural conversation:
# Over-scripted - too rigid
script=[
    scenario.user("Exact message 1"),
    scenario.agent("Exact response 1"),
    scenario.user("Exact message 2"),
    scenario.agent(),
    ensure_exact_response_2,
    scenario.succeed()
]

# Better - mix of control and automation
def mentions_account(state: scenario.ScenarioState) -> None:
    assert "account" in state.last_message().get("content", "").lower()

script=[
    scenario.user("I need help with billing"),
    scenario.agent(),           # Let agent respond naturally
    mentions_account,
    scenario.proceed(turns=2),  # Allow natural conversation
    scenario.judge()
]
Integration with External Tools
LLM Evaluators
import litellm

async def llm_evaluator(state: scenario.ScenarioState) -> None:
    """Use LLM to evaluate response quality"""
    conversation = "\n".join([
        f"{msg['role']}: {msg.get('content', '')}"
        for msg in state.messages[-4:]  # Last 4 messages
    ])

    evaluation_prompt = f"""
    Evaluate this customer service conversation:

    {conversation}

    Is the agent response professional and helpful? Respond with just "yes" or "no".
    """

    # Use litellm's async entry point so the call can be awaited
    response = await litellm.acompletion(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": evaluation_prompt}]
    )

    result = response.choices[0].message.content.strip().lower()
    assert result == "yes", f"LLM evaluation failed: {result}"

script=[
    scenario.user("I'm frustrated with my service"),
    scenario.agent(),
    llm_evaluator,  # External LLM evaluation
    scenario.proceed()
]
Database Validation
async def verify_database_update(state: scenario.ScenarioState) -> None:
    """Verify agent actually updated the database"""
    # Extract customer ID from conversation
    messages_text = str(state.messages)

    # This would be an actual database check
    customer_updated = await check_customer_record_updated(
        customer_id="extracted_from_conversation"
    )

    assert customer_updated, "Agent should have updated customer record"

script=[
    scenario.user("Please update my contact information"),
    scenario.agent(),
    verify_database_update,  # Verify actual system changes
    scenario.succeed("Contact information updated successfully")
]
Mocking with pytest
import pytest
from unittest.mock import patch

import my_agent_module

@pytest.mark.asyncio
@patch("my_agent_module.external_weather_api")
async def test_weather_agent_with_mock(mock_weather_api):
    """Test agent behavior with mocked external service"""
    # Set up mock response
    mock_weather_api.get_weather.return_value = {
        "temperature": "72°F",
        "condition": "sunny",
        "humidity": "45%"
    }

    class WeatherAgent(scenario.AgentAdapter):
        async def call(self, input: scenario.AgentInput) -> str:
            # Look the API up through the patched module so the mock applies
            weather_data = my_agent_module.external_weather_api.get_weather("New York")
            return f"Weather in New York: {weather_data['temperature']}, {weather_data['condition']}"

    result = await scenario.run(
        name="mocked weather test",
        description="User asks about weather",
        agents=[
            WeatherAgent(),
            scenario.UserSimulatorAgent(),
            scenario.JudgeAgent(criteria=["Agent provides weather information"])
        ]
    )

    # Verify the mock was called
    mock_weather_api.get_weather.assert_called_once_with("New York")
    assert result.success
Next Steps
Now that you understand scripted simulations, explore other advanced features:
- Cache - Make your tests deterministic and faster
- Debug Mode - Debug scenarios interactively
- Agent Integration - Connect different types of agents