*{box-sizing:border-box;margin:0;padding:0}

body{font-family:'Segoe UI',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100px;padding:20px;line-height:1.6}
.wrap{max-width:900px;margin:0 auto}
h1{text-align:center;font-size:1.8rem;margin:8px 0 4px;background:linear-gradient(90deg,var(--accent),var(--accent2));-webkit-background-clip:text;-webkit-text-fill-color:transparent}
.sub{text-align:center;color:var(--muted);margin-bottom:24px;font-size:.95rem}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:24px}
@media(max-width:640px){.grid{grid-template-columns:1fr}}
.cat{background:var(--card);border-radius:12px;padding:16px;border:1px solid #334155}
.cat h3{font-size:1rem;margin-bottom:12px;padding-bottom:8px;border-bottom:1px solid #334155}
.cat h3 .emoji{margin-right:8px}
.q{display:flex;align-items:center;justify-content:space-between;padding:8px 0;border-bottom:1px solid #1e293b}
.q:last-child{border-bottom:none}
/* Question text takes the remaining row width (flex:1) next to the select. */
.q label{flex:1;font-size:.85rem;padding-right:12px;color:#cbd5e1}
.q select{background:#0f172a;border:1px solid #475569;border-radius:6px;color:var(--text);padding:6px 10px;font-size:.85rem;min-width:70px;cursor:pointer}
.q select:focus{outline:none;border-color:var(--accent)}
.score-bar{height:6px;background:#334155;border-radius:3px;margin-top:4px;overflow:hidden}
.score-fill{height:100%;border-radius:3px;transition:width .5s ease}
.result{background:linear-gradient(135deg,var(--card),#1a2234);border-radius:16px;padding:24px;text-align:center;border:1px solid #334155;margin-bottom:24px}
.score-big{font-size:3.5rem;font-weight:700;background:linear-gradient(90deg,var(--accent),var(--accent2));-webkit-background-clip:text;-webkit-text-fill-color:transparent}
.score-label{color:var(--muted);font-size:.9rem;margin-top:4px}
.verdict{font-size:1.1rem;margin:12px 0;padding:12px;border-radius:8px;font-weight:600}
.verdict.prod{background:rgba(16,185,129,.15);color:var(--green);border:1px solid rgba(16,185,129,.3)}
.verdict.pilot{background:rgba(245,158,11,.15);color:var(--yellow);border:1px solid rgba(245,158,11,.3)}
.verdict.research{background:rgba(239,68,68,.15);color:var(--red);border:1px solid rgba(239,68,68,.3)}
.breakdown{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:8px;margin:16px 0}
.bd{background:#1a2234;border-radius:8px;padding:10px;text-align:center}
.bd .bd-score{font-size:1.4rem;font-weight:700}
.bd .bd-label{font-size:.75rem;color:var(--muted)}
.suggestions{text-align:left;margin-top:16px}
.suggestions h4{margin-bottom:8px;font-size:.95rem}
/* Individual improvement suggestions, rendered in light slate text. */
.suggestions li{padding:4px 0;font-size:.85rem;color:#cbd5e1}
.btn{background:var(--accent);color:#fff;border:none;padding:10px 24px;border-radius:8px;font-size:.9rem;cursor:pointer;margin-top:12px}
.btn:hover{background:#2563eb}

d.innerHTML=`${cat.emoji}${cat.name} (${cat.weight}%)`;

qd.innerHTML=`${ques}

${labels.map((l,i)=>`${l}`).join('')}

document.getElementById('result').scrollIntoView({behavior:'smooth'});

s.innerHTML='🎯 Priority improvements:'+weak.map(w=>`Strengthen ${w.name} — lowest scoring category`).join('')+'';

document.getElementById(`fill${ci}`).style.width='0%';

const categories=[

const labels=["Not implemented","Partial","Basic","Good","Excellent"];
const values=[0,1,2.5,4,5];

function buildQuiz(){
const q=document.getElementById('quiz');

const d=document.createElement('div');

const id=`c${ci}q${qi}`;
const qd=document.createElement('div');

const lb=document.createElement('div');
lb.id=`bar${ci}`;

function liveScore(){
let total=0,totalW=0;

let catSum=0,catCount=0;

const v=parseFloat(document.getElementById(`c${ci}q${qi}`).value);
if(v>=0){catSum+=v;catCount++}

const catMax=catCount*5;
const pct=catMax>0?(catSum/catMax)*100:0;
document.getElementById(`fill${ci}`).style.width=pct+'%';
const colors=['#ef4444','#f59e0b','#3b82f6','#10b981'];
const ci2=pct>=80?3:pct>=60?2:pct>=30?1:0;
document.getElementById(`fill${ci}`).style.background=colors[ci2];

const score=totalW>0?Math.round(total/totalW*100):0;

function showVerdict(score,details){
const v=document.getElementById('verdict');
const b=document.getElementById('breakdown');
const s=document.getElementById('suggestions');
if(score>=80){v.className='verdict prod';v.textContent='✅ PRODUCTION READY — This agent meets criteria for autonomous deployment with monitoring.'}
else if(score>=60){v.className='verdict pilot';v.textContent='🟡 PILOT READY — Deploy in a controlled pilot with human oversight. Address gaps before full rollout.'}
else{v.className='verdict research';v.textContent='🔴 RESEARCH STAGE — Significant gaps remain. Focus on core reliability and safety before any deployment.'}

let catSum=0,catCount=0;

const v2=parseFloat(document.getElementById(`c${ci}q${qi}`).value);
if(v2>=0){catSum+=v2;catCount++}

const pct=catCount>0?Math.round(catSum/(catCount*5)*100):0;
const colors={high:'var(--green)',mid:'var(--yellow)',low:'var(--red)'};
const c=pct>=70?'high':pct>=40?'mid':'low';
return `${pct}%
${cat.name}`;

const weak=categories.filter((cat,ci)=>{
let catSum=0,catCount=0;

const v2=parseFloat(document.getElementById(`c${ci}q${qi}`).value);
if(v2>=0){catSum+=v2;catCount++}

return catCount>0&&(catSum/(catCount*5))<0.5;

function resetQuiz(){

AI Agent Evaluation Framework

/* Theme palette: custom properties read via var(--name) by the rules in this stylesheet. */
:root{
--bg:#0f172a;--card:#1e293b;--accent:#3b82f6;--accent2:#8b5cf6;
--green:#10b981;--yellow:#f59e0b;--red:#ef4444;--text:#e2e8f0;--muted:#94a3b8;
}

🤖 AI Agent Evaluation Framework

Score your AI agent across 24 criteria to determine production readiness

{emoji:“🎯“,name:“Task Completion“,weight:25,questions:[
„Does the agent complete the primary task successfully >90% of the time?“,
„Does the agent handle edge cases gracefully (no crashes on unexpected inputs)?“,
„Does the agent produce outputs in the correct format without manual cleanup?“,
„Does the agent know when to ask for clarification vs. guessing?“
]},
{emoji:“🔧“,name:“Reliability“,weight:20,questions:[
„Does the agent give consistent results for the same input (low variance)?“,
„Does the agent handle API/tool failures with retry or fallback logic?“,
„Are agent responses reproducible within acceptable tolerance?“,
„Does the agent have a human-in-the-loop escalation path?“
]},
{emoji:“🧠“,name:“Reasoning“,weight:20,questions:[
„Does the agent break complex tasks into logical steps?“,
„Can the agent explain its reasoning (chain-of-thought transparency)?“,
„Does the agent avoid common hallucinations in its domain?“,
„Does the agent correctly handle multi-step dependencies?“
]},
{emoji:“🔒“,name:“Safety & Grounding“,weight:15,questions:[
„Does the agent refuse clearly inappropriate requests?“,
„Are tool calls scoped to minimum required permissions?“,
„Does the agent’s RAG retrieval show source attribution?“,
„Does the agent avoid leaking sensitive context between sessions?“
]},
{emoji:“📊“,name:“Observability“,weight:10,questions:[
„Are all agent actions and tool calls logged with timestamps?“,
„Can you trace a failure back to a specific step or retrieval failure?“,
„Are token costs and latency tracked per task?“,
„Is there a monitoring dashboard for agent health?“
]},
{emoji:“🚀“,name:“Scalability“,weight:10,questions:[
„Can the agent handle 10x current task volume without degradation?“,
„Is the memory/retrieval system indexed for sub-100ms queries?“,
„Can multiple instances run concurrently without conflicts?“,
„Is the agent architecture model-agnostic (swappable LLM backend)?“
]}
];

categories.forEach((cat,ci)=>{

d.className=’cat‘;

cat.questions.forEach((ques,qi)=>{

qd.className=’q‘;

—

`;
d.appendChild(qd);
});

lb.className=’score-bar‘;
lb.innerHTML=‘

‚;
d.appendChild(lb);
q.appendChild(d);
});
}

categories.forEach((cat,ci)=>{

cat.questions.forEach((_,qi)=>{

});

total+=catSum/5*cat.weight;
totalW+=catCount>0?cat.weight:0;
});

// Publish the overall score, refresh the verdict panel, and reveal the result section.
document.getElementById('scoreBig').textContent=score;
showVerdict(score, total);
document.getElementById('result').style.display='block';

}

// breakdown
b.innerHTML=categories.map((cat,ci)=>{

cat.questions.forEach((_,qi)=>{

});

}).join(“);
// suggestions

cat.questions.forEach((_,qi)=>{

});

});
if(weak.length){

} else {
s.innerHTML=‘

✨ Strong across all categories!

Focus on continuous monitoring and A/B testing to maintain quality at scale.

‚;
}
}

// Put every dropdown back to -1; scoring only counts values >= 0, so -1 reads as unanswered
// (presumably the placeholder option's value — confirm against the generated <select> markup).
document.querySelectorAll('select').forEach(sel=>sel.value=-1);
categories.forEach((_,ci)=>{

});
// Hide the result panel and zero the headline score.
document.getElementById('result').style.display='none';
document.getElementById('scoreBig').textContent='0';
}

buildQuiz();

🤖 AI Agent Evaluation Framework

✨ Strong across all categories!

Related Articles

Schreibe einen Kommentar Antwort abbrechen