Published April 3, 2026
6 min read
Enterprise buyers don't ask "is your AI safe?" They ask "prove it." And they don't want a slide deck with best intentions—they want artifacts: test results, configuration exports, monitoring dashboards, incident response procedures, and a clear mapping from your controls to their compliance requirements.
If you've already implemented guardrails, monitoring, and red team testing, most of the evidence already exists. You just need to package it in a way that procurement teams and auditors can evaluate.
After working through multiple enterprise procurement processes, I've found the requests fall into five categories:
1. Security posture documentation
2. Test results and coverage metrics
3. Monitoring and incident response
4. Data handling and privacy
5. Compliance mapping
The 48-hour baseline assessment produces a structured report that doubles as the foundation of your evidence package:
from dataclasses import dataclass, field
from datetime import datetime, timezone
@dataclass
class Finding:
    """One issue identified during a baseline safety assessment."""
    id: str  # unique finding identifier
    category: str  # assessment category the finding falls under
    severity: str  # one of: critical, high, medium, low
    title: str  # short summary shown in evidence exports
    description: str  # full description of the issue
    evidence: str  # what was observed
    remediation: str  # recommended fix
    status: str  # one of: open, mitigated, accepted
@dataclass
class BaselineReport:
    """Structured baseline assessment output.

    Holds the raw assessment data; to_evidence_format() renders the
    buyer-facing summary document.
    """

    assessment_date: str
    system_description: str
    scope: list[str]
    methodology: str
    findings: list[Finding]
    risk_summary: dict  # counts by severity
    recommendations: list[str]

    def to_evidence_format(self) -> dict:
        """Render this report as a buyer-facing evidence document."""
        # Tally findings by severity in a single pass; severities outside the
        # four known levels are not counted (matching the exported schema).
        tally = {"critical": 0, "high": 0, "medium": 0, "low": 0}
        for item in self.findings:
            if item.severity in tally:
                tally[item.severity] += 1

        # Only the identifying fields are exported; description/evidence/
        # remediation stay internal.
        summaries = [
            {
                "id": item.id,
                "category": item.category,
                "severity": item.severity,
                "title": item.title,
                "status": item.status,
            }
            for item in self.findings
        ]

        return {
            "document_type": "LLM Safety Baseline Assessment",
            "date": self.assessment_date,
            "scope": self.scope,
            "methodology": self.methodology,
            "finding_count": tally,
            "findings": summaries,
            "overall_risk_level": self.risk_summary.get("overall", "not_assessed"),
        }

# Show exactly what safety controls are in place and how they're configured:
@dataclass
class GuardrailConfig:
    """Configuration snapshot for a single guardrail, exported as evidence."""
    name: str  # guardrail identifier
    type: str  # one of: input, output, both
    description: str  # human-readable explanation of what the guardrail checks
    enabled: bool  # whether the guardrail is currently active
    threshold: float  # trigger threshold; exact semantics depend on the guardrail — confirm per type
    on_fail_action: str  # one of: block, filter, log, alert
    last_updated: str  # timestamp string of the last configuration change
    test_coverage: float  # percentage of test suite that covers this guardrail (0-1 fraction; formatted with :.0% on export)
class GuardrailConfigExporter:
    """Exports guardrail configuration as evidence documentation.

    Produces a dict ready for inclusion in a buyer-facing evidence package.
    """

    def __init__(self, guardrail_registry):
        # Registry must expose get_all_configs() -> iterable of GuardrailConfig
        # (or any objects with the same attributes).
        self.registry = guardrail_registry

    def export(self) -> dict:
        """Build the Guardrail Configuration Report.

        Returns:
            Dict with aggregate counts plus one entry per guardrail.
        """
        # Materialize so len() and multiple passes work even if the registry
        # returns a generator (the original assumed a sized sequence).
        configs = list(self.registry.get_all_configs())
        return {
            "document_type": "Guardrail Configuration Report",
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # since Python 3.12 and produced a naive timestamp.
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "total_guardrails": len(configs),
            "active_guardrails": len([c for c in configs if c.enabled]),
            "coverage": {
                # Guardrails of type "both" count toward input AND output.
                "input_guardrails": len([c for c in configs if c.type in ("input", "both")]),
                "output_guardrails": len([c for c in configs if c.type in ("output", "both")]),
            },
            "guardrails": [
                {
                    "name": c.name,
                    "type": c.type,
                    "description": c.description,
                    "enabled": c.enabled,
                    "threshold": c.threshold,
                    "on_fail_action": c.on_fail_action,
                    "last_updated": c.last_updated,
                    "test_coverage": f"{c.test_coverage:.0%}",  # 0.95 -> "95%"
                }
                for c in configs
            ],
        }

# Buyers want to see what was tested, how it was tested, and what the results were:
class RedTeamReportGenerator:
    """Generates buyer-facing red team assessment reports."""

    def generate(self, test_results: list[dict], assessment_metadata: dict) -> dict:
        """Aggregate raw test results into a Red Team Assessment Report.

        Args:
            test_results: dicts with at least "category" and "passed" keys.
            assessment_metadata: must contain "date", "methodology" and
                "suite_version"; may contain "false_positive_rate".

        Returns:
            Report dict with an overall summary and per-category pass rates.
        """
        # Aggregate results by category.
        categories: dict[str, dict[str, int]] = {}
        for result in test_results:
            bucket = categories.setdefault(
                result["category"], {"total": 0, "passed": 0, "failed": 0}
            )
            bucket["total"] += 1
            bucket["passed" if result["passed"] else "failed"] += 1

        total = len(test_results)
        passed = sum(1 for r in test_results if r["passed"])
        # Fix: the original divided by len(test_results) unconditionally and
        # raised ZeroDivisionError for an empty result set.
        overall_rate = f"{passed / total:.1%}" if total else "no_data"

        return {
            "document_type": "Red Team Assessment Report",
            "assessment_date": assessment_metadata["date"],
            "methodology": assessment_metadata["methodology"],
            "test_suite_version": assessment_metadata["suite_version"],
            "summary": {
                "total_tests": total,
                "passed": passed,
                "failed": total - passed,
                "overall_pass_rate": overall_rate,
            },
            "by_category": {
                # Per-category division is safe: every bucket has total >= 1.
                cat: {
                    "tests_run": data["total"],
                    "passed": data["passed"],
                    "pass_rate": f"{data['passed'] / data['total']:.1%}",
                }
                for cat, data in categories.items()
            },
            "false_positive_rate": assessment_metadata.get("false_positive_rate", "not_measured"),
            "retest_cadence": "Every deployment + monthly comprehensive",
        }

# Buyers need to know what happens when something goes wrong. A safety incident
# response playbook covers:
# Buyer-facing incident response playbook, exported verbatim as part of the
# evidence package. Three severity tiers (P0/P1/P2); "response_time" is
# presumably the maximum time to begin the listed actions — confirm with the
# on-call process owner.
INCIDENT_RESPONSE_PLAYBOOK = {
    "document_type": "LLM Safety Incident Response Playbook",
    "severity_levels": {
        # P0: immediate safety/privacy impact — the endpoint is taken offline.
        "P0_critical": {
            "definition": "Active data leakage, successful prompt injection with data exfiltration, or safety bypass affecting multiple users",
            "response_time": "15 minutes",
            "actions": [
                "Activate circuit breaker (disable LLM endpoint)",
                "Notify security team and incident commander",
                "Preserve audit logs for investigation",
                "Notify affected users within 24 hours",
            ],
        },
        # P1: serious but contained — mitigate, assess scope, prevent regression.
        "P1_high": {
            "definition": "Guardrail bypass detected, PII exposure in single response, or cost spike exceeding 5x baseline",
            "response_time": "1 hour",
            "actions": [
                "Tighten guardrail thresholds",
                "Review audit logs for scope assessment",
                "Add finding to regression test suite",
                "Root cause analysis within 24 hours",
            ],
        },
        # P2: quality/robustness degradation — update detection, test, watch.
        "P2_medium": {
            "definition": "New jailbreak technique succeeds, hallucination rate increase, or false positive rate spike",
            "response_time": "4 hours",
            "actions": [
                "Update detection rules",
                "Add test cases to regression suite",
                "Monitor for recurrence",
            ],
        },
    },
    "communication_channels": {
        "internal": "Dedicated Slack channel + PagerDuty",
        "external": "Status page + email notification",
    },
    "post_incident_review": "Required for all P0 and P1 incidents within 72 hours",
}
The most persuasive artifact in an evidence package is a clear mapping from your technical controls to the buyer's compliance requirements.
| Requirement | Framework | Your Control | Evidence |
|---|---|---|---|
| Processing integrity | SOC 2 PI1.1 | Input/output guardrails | Guardrail config export + test results |
| Data confidentiality | SOC 2 C1.1 | PII detection/redaction | PII test suite pass rate |
| Incident response | SOC 2 CC7.3 | Safety incident playbook | Playbook document + drill records |
| Data minimization | GDPR Art. 5(1)(c) | Context window limits, PII redaction | Configuration export + audit logs |
| Right to erasure | GDPR Art. 17 | Audit log retention policy | Data retention documentation |
| Breach notification | GDPR Art. 33 | P0 incident response process | Playbook + notification templates |
| AI transparency | Industry-specific | Model card, guardrail documentation | Full evidence package |
The best evidence packages are generated automatically from your existing systems—not assembled manually before each audit:
class EvidenceCollector:
    """Automatically collects safety evidence from existing systems.

    Aggregates guardrail configuration, test results, monitoring data,
    incident history, and audit coverage for a reporting period.
    """

    def __init__(self, monitoring_client, test_runner, audit_store, config_registry):
        self.monitoring = monitoring_client
        self.tests = test_runner
        self.audit = audit_store
        self.config = config_registry

    async def collect(self, period_start: str, period_end: str) -> dict:
        """Collect all evidence for the specified period."""
        return {
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # since Python 3.12 and produced a naive timestamp.
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "period": {"start": period_start, "end": period_end},
            "guardrail_config": GuardrailConfigExporter(self.config).export(),
            "test_results": await self._collect_test_evidence(period_start, period_end),
            "monitoring_summary": await self._collect_monitoring_evidence(period_start, period_end),
            "incident_summary": await self._collect_incident_evidence(period_start, period_end),
            "audit_coverage": await self._collect_audit_evidence(period_start, period_end),
        }

    async def _collect_test_evidence(self, start: str, end: str) -> dict:
        """Summarize red team / regression test runs for the period."""
        results = await self.tests.get_results(start, end)
        return {
            "total_test_runs": len(results),
            "latest_pass_rate": results[-1]["pass_rate"] if results else "no_data",
            "regression_trend": self._calculate_trend(results),
            "coverage_by_category": self._aggregate_coverage(results),
        }

    async def _collect_monitoring_evidence(self, start: str, end: str) -> dict:
        """Pull guardrail, cost, safety-score, and drift data from monitoring."""
        return {
            "guardrail_events": await self.monitoring.get_guardrail_events(start, end),
            "cost_summary": await self.monitoring.get_cost_summary(start, end),
            "safety_scores": await self.monitoring.get_safety_scores(start, end),
            "drift_alerts": await self.monitoring.get_drift_alerts(start, end),
        }

    async def _collect_incident_evidence(self, start: str, end: str) -> dict:
        """Summarize safety incidents recorded in the audit store."""
        incidents = await self.audit.get_incidents(start, end)
        return {
            "total_incidents": len(incidents),
            "by_severity": self._count_by_severity(incidents),
            "mean_time_to_detect": self._calculate_mttd(incidents),
            "mean_time_to_resolve": self._calculate_mttr(incidents),
            # Vacuously True when there were no incidents in the period.
            "all_resolved": all(i["status"] == "resolved" for i in incidents),
        }

    async def _collect_audit_evidence(self, start: str, end: str) -> dict:
        """Report audit-log coverage and retention for the period."""
        return {
            "total_interactions_logged": await self.audit.count_interactions(start, end),
            "audit_coverage": "100%",  # all interactions are logged
            "retention_policy": "90 days",
            "tamper_protection": "append-only store with hash chain",
        }

    # --- helpers ---------------------------------------------------------
    # Fix: the methods below were referenced above but never defined, so any
    # collect() call raised AttributeError. Implementations are conservative;
    # field names are assumptions to confirm against the real data schemas.

    def _calculate_trend(self, results: list) -> str:
        """Compare first vs. last numeric "pass_rate" across test runs.

        NOTE(review): assumes each run dict may carry a numeric "pass_rate" —
        confirm against the test runner's result schema.
        """
        rates = [r["pass_rate"] for r in results
                 if isinstance(r.get("pass_rate"), (int, float))]
        if len(rates) < 2:
            return "insufficient_data"
        if rates[-1] > rates[0]:
            return "improving"
        if rates[-1] < rates[0]:
            return "declining"
        return "stable"

    def _aggregate_coverage(self, results: list) -> dict:
        """Merge per-run "coverage_by_category" counts into one total dict.

        NOTE(review): assumes each run dict may carry a category->count
        mapping under "coverage_by_category" — confirm schema.
        """
        totals: dict = {}
        for run in results:
            for category, count in run.get("coverage_by_category", {}).items():
                totals[category] = totals.get(category, 0) + count
        return totals

    def _count_by_severity(self, incidents: list) -> dict:
        """Count incidents by their "severity" field ("unknown" if absent)."""
        counts: dict = {}
        for incident in incidents:
            severity = incident.get("severity", "unknown")
            counts[severity] = counts.get(severity, 0) + 1
        return counts

    def _calculate_mttd(self, incidents: list):
        """Mean of numeric "time_to_detect" values, or "no_data" when absent.

        NOTE(review): field name and units assumed — confirm against the
        audit store's incident records.
        """
        return self._mean_of(incidents, "time_to_detect")

    def _calculate_mttr(self, incidents: list):
        """Mean of numeric "time_to_resolve" values, or "no_data" when absent."""
        return self._mean_of(incidents, "time_to_resolve")

    def _mean_of(self, incidents: list, key: str):
        """Average the numeric values of *key*; "no_data" when none present."""
        values = [i[key] for i in incidents if isinstance(i.get(key), (int, float))]
        return sum(values) / len(values) if values else "no_data"

# Run evidence collection quarterly for ongoing compliance, or on-demand when a
# buyer requests documentation. The collection process should take minutes, not days.
The strongest evidence artifact is a completed baseline assessment on the buyer's own system or a system similar to theirs. It demonstrates that your controls hold up against their actual risk profile rather than generic test conditions.
When a buyer asks "how do we know this works?", the baseline report is the answer. It's concrete, specific, and grounded in their actual risk profile—not generic best practices.
Enterprise buyers aren't buying your guardrails. They're buying proof that your guardrails work, keep working, and can be audited when questions arise.
The monitoring infrastructure and regression testing described in earlier articles provide the raw data. This article is about packaging that data into evidence that enterprise buyers trust.