body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;max-width:900px;margin:0 auto;padding:2rem;line-height:1.7;color:#1a1a1a}
h1{color:#1a1a1a;border-bottom:3px solid #6366f1;padding-bottom:.5rem}
h2{color:#334155;margin-top:2rem}
code{background:#f1f5f9;padding:.2rem .5rem;border-radius:4px;font-size:.9em}
pre{background:#1e293b;color:#e2e8f0;padding:1.5rem;border-radius:8px;overflow-x:auto;font-size:.9em}
blockquote{border-left:4px solid #6366f1;padding-left:1rem;color:#64748b;font-style:italic}
table{border-collapse:collapse;width:100%;margin:1rem 0}
th,td{border:1px solid #e2e8f0;padding:.75rem;text-align:left}
th{background:#f8fafc}
.tag{display:inline-block;background:#e0e7ff;color:#4338ca;padding:.2rem .6rem;border-radius:999px;font-size:.85em;margin-right:.5rem}
Agent Reflection Patterns: Self-Critique and Iterative Improvement
Reviewed: June 4, 2026
Published: May 26, 2026 | Reading time: 12 min | Tags: Agent Architecture, Reflection, Self-Improvement
The difference between a basic chatbot and a true AI agent? Reflection. Agents that can evaluate their own outputs, identify errors, and iteratively refine their work consistently outperform single-pass systems by 40-60% on complex tasks.
What is Agent Reflection?
Reflection is the ability of an AI agent to observe its own reasoning process, evaluate the quality of its outputs, and make corrections — without human intervention. It mirrors how human experts work: draft, review, revise.
In 2026, reflection has evolved from a research curiosity into a production necessity. The best agent frameworks (LangGraph, CrewAI, AutoGen) all include reflection primitives.
Pattern 1: Self-Evaluation Loop
The simplest reflection pattern: the agent generates a response, then evaluates it against the original request.
from dataclasses import dataclass
from typing import Optional
@dataclass
class ReflectionResult:
    """Record describing the outcome of one reflection run.

    Carries the generated text, the critique text, a normalized quality
    score, and the number of iterations that were performed.
    """

    # Text recorded as the starting output of the run.
    original_output: str
    # Critique text produced while evaluating the output.
    critique: str
    # Text recorded as the final output of the run.
    revised_output: str
    # Quality score expressed on a 0-1 scale.
    quality_score: float
    # Number of reflection iterations reported for the run.
    iterations: int
class SelfEvaluatingAgent:
    """Agent that evaluates and refines its own outputs.

    The agent generates a first draft, asks the same LLM to critique it,
    and rewrites the draft until the critique score reaches
    ``quality_threshold`` or ``max_iterations`` critiques have been made.
    """

    # Score (on the 0-10 scale) assumed when the critique contains no
    # parseable ``"score": N`` field.
    DEFAULT_SCORE = 5.0

    def __init__(self, llm, max_iterations: int = 3, quality_threshold: float = 0.8):
        """Store the LLM client and the loop limits.

        Args:
            llm: Object exposing ``generate(prompt[, context]) -> str``.
            max_iterations: Maximum number of critique rounds.
            quality_threshold: Score on the 0-1 scale at which the loop stops.
        """
        self.llm = llm
        self.max_iterations = max_iterations
        self.quality_threshold = quality_threshold

    def generate_with_reflection(self, task: str, context: str = "") -> "ReflectionResult":
        """Generate a response for ``task`` and refine it through self-critique.

        Args:
            task: The request to answer.
            context: Optional extra context passed to the first generation.

        Returns:
            A ``ReflectionResult`` holding the first draft, the last critique,
            the final text, the last score scaled to 0-1, and the number of
            critique rounds performed (0 when ``max_iterations`` is 0).

        Note:
            When the loop ends because ``max_iterations`` was reached, the
            final text was rewritten after the last critique, so the reported
            critique and score describe the version just before that rewrite.
        """
        # Step 1: generate the initial output and keep it untouched so the
        # result can report the real first draft.
        initial_output = self.llm.generate(task, context)
        output = initial_output

        # Defaults keep the result well-defined when no critique round runs.
        critique = ""
        score = 0.0
        iterations = 0

        for iterations in range(1, self.max_iterations + 1):
            # Step 2: self-evaluate.
            critique = self.llm.generate(self._build_critique_prompt(task, output))
            score = self._extract_score(critique)

            # Step 3: stop as soon as the quality is high enough.
            if score / 10 >= self.quality_threshold:
                break

            # Step 4: revise based on the critique.
            output = self.llm.generate(
                self._build_revision_prompt(task, output, critique)
            )

        return ReflectionResult(
            original_output=initial_output,
            critique=critique,
            revised_output=output,
            quality_score=score / 10,
            iterations=iterations,
        )

    @staticmethod
    def _build_critique_prompt(task: str, output: str) -> str:
        """Return the prompt asking the LLM to rate and critique ``output``."""
        return (
            "\n"
            f"Evaluate this response for the task: {task}\n"
            f"Response: {output}\n"
            "Rate quality 0-10 and list specific issues:\n"
            "- Accuracy: Is the information correct?\n"
            "- Completeness: Does it fully address the task?\n"
            "- Clarity: Is it well-structured and readable?\n"
            "- Actionability: Can the user act on this?\n"
            'Output format: {"score": N, "issues": ["issue1", "issue2"]}\n'
        )

    @staticmethod
    def _build_revision_prompt(task: str, output: str, critique: str) -> str:
        """Return the prompt asking the LLM to rewrite ``output``."""
        return (
            "\n"
            f"Original task: {task}\n"
            f"Current response: {output}\n"
            f"Issues identified: {critique}\n"
            "Please rewrite the response addressing all issues.\n"
        )

    def _extract_score(self, critique_text: str) -> float:
        """Pull the numeric ``"score"`` value (0-10) out of a critique.

        Accepts integer or decimal scores with optional whitespace after the
        colon. Returns 5.0 when no score is found; values above 10 are
        capped at 10 so the normalized score never exceeds 1.
        """
        import re

        match = re.search(r'"score":\s*(\d+(?:\.\d+)?)', critique_text)
        if match is None:
            return SelfEvaluatingAgent.DEFAULT_SCORE
        return min(float(match.group(1)), 10.0)
Pattern 2: Multi-Panel Review
Use multiple "reviewer" agents with different specializations to critique the same output:
class MultiPanelReview:
    """Multiple specialist reviewers evaluate the same output.

    Each persona in ``REVIEWER_PERSONAS`` is sent the task and the output,
    and the parsed replies are aggregated into an average rating, a combined
    issue list, and a pass/fail flag.
    """

    REVIEWER_PERSONAS = {
        "accuracy_expert": "You are a fact-checking expert. Focus only on factual accuracy and correctness.",
        "clarity_expert": "You are a technical writing expert. Focus on clarity, structure, and readability.",
        "completeness_expert": "You are a requirements analyst. Focus on whether all aspects of the task are addressed.",
        "security_expert": "You are a security engineer. Focus on safety, data handling, and potential risks.",
    }

    # Neutral rating used when a reviewer's reply cannot be parsed.
    DEFAULT_RATING = 5

    def __init__(self, llm, pass_threshold: float = 7.0):
        """Store the LLM client and the pass mark.

        Args:
            llm: Object exposing ``generate(prompt) -> str``.
            pass_threshold: Minimum average rating for ``passes`` to be True.
        """
        self.llm = llm
        self.pass_threshold = pass_threshold

    def review(self, task: str, output: str) -> dict:
        """Collect one review per persona and aggregate them.

        Args:
            task: The original request.
            output: The text to be reviewed.

        Returns:
            A dict with ``reviews`` (role -> parsed review),
            ``average_rating``, ``all_issues`` (issues of every reviewer,
            concatenated) and ``passes`` (average >= pass threshold).
        """
        reviews = {}
        for role, persona in self.REVIEWER_PERSONAS.items():
            review_prompt = (
                f"{persona}\n"
                f"Task: {task}\n"
                f"Output to review: {output}\n"
                "Provide:\n"
                "1. Rating (1-10)\n"
                "2. Top 3 issues (if any)\n"
                "3. Suggested improvements\n"
                'Format: {"rating": N, "issues": [...], "suggestions": [...]}\n'
            )
            reviews[role] = self._parse_review(self.llm.generate(review_prompt))

        # Aggregate reviews; guard the division in case a subclass empties
        # the persona table.
        if reviews:
            avg_rating = sum(r["rating"] for r in reviews.values()) / len(reviews)
        else:
            avg_rating = 0.0
        all_issues = [issue for r in reviews.values() for issue in r["issues"]]

        return {
            "reviews": reviews,
            "average_rating": avg_rating,
            "all_issues": all_issues,
            "passes": avg_rating >= self.pass_threshold,
        }

    def _parse_review(self, text: str) -> dict:
        """Turn a reviewer reply into ``{"rating", "issues", "suggestions"}``.

        The reply may be pure JSON or a JSON object surrounded by prose.
        Anything that cannot be parsed yields the neutral fallback
        (rating 5, no issues, no suggestions) instead of raising, so one
        malformed reply never aborts the whole panel.
        """
        structured = self._load_json_object(text)
        if structured is None:
            return {"rating": self.DEFAULT_RATING, "issues": [], "suggestions": []}

        # Models sometimes quote the number ("6"); coerce so the average in
        # review() cannot fail on a string.
        try:
            rating = float(structured.get("rating", self.DEFAULT_RATING))
        except (TypeError, ValueError):
            rating = self.DEFAULT_RATING

        return {
            "rating": rating,
            "issues": self._as_list(structured.get("issues")),
            "suggestions": self._as_list(structured.get("suggestions")),
        }

    @staticmethod
    def _load_json_object(text):
        """Return the JSON object found in ``text`` as a dict, or None."""
        import json
        import re

        if not isinstance(text, str):
            return None

        candidates = [text]
        # Fall back to the outermost {...} span when the reply wraps the
        # JSON in explanatory prose.
        embedded = re.search(r"\{.*\}", text, re.DOTALL)
        if embedded is not None and embedded.group(0) != text:
            candidates.append(embedded.group(0))

        for candidate in candidates:
            try:
                loaded = json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(loaded, dict):
                return loaded
        return None

    @staticmethod
    def _as_list(value) -> list:
        """Normalize a JSON field to a list (a lone string becomes one item)."""
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.strip():
            return [value]
        return []
Pattern 3: External Benchmark Comparison
Compare outputs against known-good references:
class BenchmarkComparison:
    """Compare agent outputs against gold-standard references."""

    def __init__(self, reference_outputs: dict[str, str], threshold: float = 0.7):
        """Store the reference texts and the pass mark.

        Args:
            reference_outputs: Mapping of task_id -> ideal output.
            threshold: Minimum concept coverage for ``meets_threshold``.
        """
        self.references = reference_outputs  # task_id -> ideal_output
        self.threshold = threshold

    def evaluate(self, task_id: str, agent_output: str) -> dict:
        """Score ``agent_output`` against the reference stored for ``task_id``.

        Two case-insensitive measures are computed:

        * word overlap: share of the reference's distinct words that also
          appear in the output;
        * concept coverage: share of the reference's capitalized phrases
          (e.g. "Gradient Descent") that occur in the output.

        Returns:
            ``{"comparable": False, "reason": ...}`` when no reference exists,
            otherwise a dict with ``comparable``, ``word_overlap``,
            ``concept_coverage``, ``overall_similarity`` (mean of the two
            measures) and ``meets_threshold``.
        """
        import re

        if task_id not in self.references:
            return {"comparable": False, "reason": "No reference available"}

        reference = self.references[task_id]
        # Lower-case once instead of once per concept.
        output_lower = agent_output.lower()

        # Word overlap similarity
        ref_words = set(reference.lower().split())
        out_words = set(output_lower.split())
        overlap = len(ref_words & out_words) / len(ref_words) if ref_words else 0

        # Key concept coverage: runs of Capitalized words in the reference.
        key_concepts = re.findall(
            r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', reference
        )
        # Collapse whitespace on both sides so a phrase broken across lines
        # in one text still matches the other.
        normalized_output = " ".join(output_lower.split())
        covered = sum(
            1
            for concept in key_concepts
            if " ".join(concept.lower().split()) in normalized_output
        )
        concept_coverage = covered / len(key_concepts) if key_concepts else 0

        return {
            "comparable": True,
            "word_overlap": overlap,
            "concept_coverage": concept_coverage,
            "overall_similarity": (overlap + concept_coverage) / 2,
            "meets_threshold": concept_coverage >= self.threshold,
        }
Pattern 4: Iterative Refinement with Memory
Learn from past reflections to improve future outputs:
class ReflectiveAgentWithMemory:
    """Agent that learns from past reflection patterns.

    Every reflection is appended to a JSON file; before generating, the
    agent looks up issues that recurred in earlier reflections of the same
    task type and adds them to the prompt as a warning.
    """

    # Number of reflections kept in memory and on disk.
    MAX_STORED_REFLECTIONS = 100
    # Number of most recent matching reflections scanned for recurring issues.
    RECENT_WINDOW = 20

    def __init__(self, llm, reflection_memory_path: str = "reflections.json"):
        """Store the LLM client and load any existing reflection history.

        Args:
            llm: Object exposing ``generate(prompt) -> str``.
            reflection_memory_path: JSON file used to persist reflections.
        """
        self.llm = llm
        self.memory_path = reflection_memory_path
        self.past_reflections = self._load_memory()

    def generate_and_reflect(self, task: str, task_type: str = "general") -> dict:
        """Generate an output for ``task``, reflect on it, and remember the result.

        Args:
            task: The request to answer.
            task_type: Label used to match this task with earlier reflections.

        Returns:
            A dict with ``output``, ``reflection`` (the LLM's analysis of the
            output) and ``past_issues_avoided`` (number of recurring issues
            that were added to the prompt as a warning).
        """
        from datetime import datetime, timezone

        # Retrieve relevant past reflections
        relevant = [r for r in self.past_reflections
                    if r.get("task_type") == task_type]
        common_issues = self._extract_common_issues(relevant)

        # Generate with awareness of past mistakes
        warning = ""
        if common_issues:
            warning = f"\nCommon issues to avoid (from past reflections): {common_issues}"
        output = self.llm.generate(task + warning)

        # Reflect
        reflection = self.llm.generate(f"Analyze this output for issues: {output}")

        # Store reflection with a timezone-aware UTC timestamp
        # (datetime.utcnow() is deprecated and returns a naive value).
        self._store_reflection({
            "task_type": task_type,
            "issues_found": reflection,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        })

        return {
            "output": output,
            "reflection": reflection,
            "past_issues_avoided": len(common_issues),
        }

    def _extract_common_issues(self, reflections: list[dict]) -> list[str]:
        """Return up to five issue lines seen at least twice in recent reflections.

        Each reflection's ``issues_found`` text is split into lines; blank
        lines are ignored and only the last 20 reflections are considered.
        """
        from collections import Counter

        counter = Counter()
        for entry in reflections[-self.RECENT_WINDOW:]:
            # Tolerate entries whose field is missing or null.
            text = entry.get("issues_found") or ""
            counter.update(
                line.strip() for line in str(text).splitlines() if line.strip()
            )
        return [issue for issue, count in counter.most_common(5) if count >= 2]

    def _load_memory(self) -> list[dict]:
        """Read the reflection history, returning [] if it is missing or unusable."""
        import json

        try:
            with open(self.memory_path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON: start with no history.
            return []
        if not isinstance(data, list):
            return []
        # Drop malformed entries so later .get() calls cannot fail.
        return [entry for entry in data if isinstance(entry, dict)]

    def _store_reflection(self, entry: dict):
        """Append ``entry`` and persist the most recent reflections to disk."""
        import json

        self.past_reflections.append(entry)
        # Trim in memory as well, so the list does not grow without bound
        # and stays identical to what a reload from disk would produce.
        del self.past_reflections[:-self.MAX_STORED_REFLECTIONS]
        with open(self.memory_path, "w", encoding="utf-8") as f:
            json.dump(self.past_reflections, f)
When to Use Each Pattern
| Pattern | Best For | Cost | Complexity |
|---|---|---|---|
| Self-Evaluation Loop | Single-agent quality improvement | 2-3x tokens | Low |
| Multi-Panel Review | High-stakes outputs (medical, legal, code) | 4-5x tokens | Medium |
| Benchmark Comparison | Regression testing, known-good outputs | 1x tokens | Low |
| Iterative + Memory | Long-running agents, repeated task types | 2x tokens | Medium |
Part of the Agent Architecture series on DataGate.ch.
