Published February 28, 2026
6 min read
Your RAG pipeline trusts every document it ingests. Attackers know this. While most RAG security discussions focus on retrieval and generation—how documents get used after they're in the vector database—the ingestion step is where the most dangerous payloads enter the system.
This article focuses specifically on what happens between document upload and vector storage: the parsing, extraction, and sanitization mechanics that determine whether a malicious document becomes a time bomb in your knowledge base.
For the broader RAG security picture—indirect prompt injection at retrieval time, data poisoning strategies, and defense-in-depth architecture—see RAG Security Fundamentals.
The typical RAG ingestion pipeline: (1) document upload, (2) text extraction, (3) chunking, (4) embedding, (5) vector storage.
Security failures at step 2—text extraction—are the most dangerous because they determine what content enters the system. If a malicious payload survives extraction, it will be faithfully embedded, stored, and eventually retrieved.
PDFs are the most common document type in enterprise RAG systems—and the most dangerous. Their rich format allows content to be hidden in ways that text-based scanners miss entirely.
PDFs support multiple text layers. Content can be invisible to humans but fully readable by text extractors:
import fitz # PyMuPDF
def scan_pdf_hidden_text(pdf_path: str) -> list[dict]:
    """Detect text that is invisible to human readers but extractable.

    Flags three hiding techniques: white-on-white text, near-zero font
    sizes, and text blocks positioned entirely outside the visible page.

    Args:
        pdf_path: Path to the PDF file to scan.

    Returns:
        Finding dicts with page number, a text snippet, the technique,
        and a risk rating.
    """
    findings = []
    white_rgb = 16777215  # 0xFFFFFF — pure white in PyMuPDF's packed-int color
    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc):
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" not in block:
                continue  # image blocks carry no text spans
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    # White text on white background. Assumes a white page;
                    # colored backgrounds would need a render-based check.
                    if span["color"] == white_rgb:
                        findings.append({
                            "page": page_num + 1,
                            "text": text[:200],
                            "technique": "white_on_white",
                            "risk": "high",
                        })
                    # Effectively invisible font size.
                    if span["size"] < 0.5:
                        findings.append({
                            "page": page_num + 1,
                            "text": text[:200],
                            "technique": "zero_font_size",
                            "risk": "high",
                        })
        # Content placed entirely outside the visible page area.
        page_rect = page.rect
        for block in blocks:
            if "bbox" in block:
                bbox = fitz.Rect(block["bbox"])
                if not page_rect.intersects(bbox):
                    findings.append({
                        "page": page_num + 1,
                        "text": str(block.get("lines", ""))[:200],
                        "technique": "off_page_content",
                        "risk": "high",
                    })
    doc.close()  # FIX: the original leaked the open document handle
    return findings

PDF metadata fields—title, author, subject, keywords, custom properties—are often extracted alongside content and included in chunks. Attackers embed injection payloads in these fields:
def scan_pdf_metadata(pdf_path: str, injection_patterns: list[str]) -> list[dict]:
    """Check PDF metadata fields for injection payloads.

    Args:
        pdf_path: Path to the PDF file to scan.
        injection_patterns: Case-insensitive substrings that indicate a
            prompt-injection attempt.

    Returns:
        Finding dicts describing matched patterns and oversized fields.
    """
    doc = fitz.open(pdf_path)
    metadata = doc.metadata or {}  # guard against a missing metadata dict
    doc.close()  # FIX: close the handle; the metadata dict is a plain copy
    findings = []
    suspect_fields = ["title", "author", "subject", "keywords", "creator", "producer"]
    for field in suspect_fields:
        value = metadata.get(field, "")
        if not value:
            continue
        # Check for common injection indicators (both sides lowercased).
        for pattern in injection_patterns:
            if pattern.lower() in value.lower():
                findings.append({
                    "field": field,
                    "value": value[:200],
                    "matched_pattern": pattern,
                    "risk": "high",
                })
        # Flag unusually long metadata (normal titles are <200 chars).
        if len(value) > 300:
            findings.append({
                "field": field,
                "value": value[:200] + "...",
                "technique": "oversized_metadata",
                "risk": "medium",
            })
    return findings
# Heuristic substrings that indicate prompt-injection attempts. Every call
# site lowercases both the pattern and the scanned text before matching, so
# the mixed-case entries below are kept only for human readability.
INJECTION_PATTERNS = [
"ignore previous",
"ignore above",
"system prompt",
"you are now",
"new instructions",
"disregard",
"forget everything",
"IMPORTANT:",
"SYSTEM:",
"ADMIN:",
]

PDFs often contain embedded images with text that OCR will extract. An attacker can embed an image where the visible content looks like a normal chart, but OCR reads hidden text instructions layered underneath.
from PIL import Image
import pytesseract
import io
def extract_and_scan_pdf_images(pdf_path: str) -> list[dict]:
    """Extract images from PDF, OCR them, and scan for injection.

    Args:
        pdf_path: Path to the PDF file to scan.

    Returns:
        Finding dicts for every OCR'd image whose text matches one of
        the module-level INJECTION_PATTERNS.
    """
    doc = fitz.open(pdf_path)
    findings = []
    for page_num, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            # FIX: pix.n counts the alpha channel, so the original `pix.n >= 4`
            # also caught plain RGBA. Per the PyMuPDF recipe, only convert when
            # the color component count (n minus alpha) indicates CMYK or wider.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            img_data = pix.tobytes("png")
            pix = None  # release pixmap buffer promptly
            pil_image = Image.open(io.BytesIO(img_data))
            # OCR the image and scan the output for injection patterns.
            ocr_text = pytesseract.image_to_string(pil_image)
            for pattern in INJECTION_PATTERNS:
                if pattern.lower() in ocr_text.lower():
                    findings.append({
                        "page": page_num + 1,
                        "image_index": img_index,
                        "ocr_text": ocr_text[:300],
                        "matched_pattern": pattern,
                        "risk": "critical",
                    })
    doc.close()  # FIX: the original leaked the open document handle
    return findings

Spreadsheets introduce a unique attack vector: formula injection that becomes prompt injection after text extraction.
def scan_spreadsheet_cells(file_path: str) -> list[dict]:
    """Scan spreadsheet cells for injection payloads.

    Args:
        file_path: Path to an .xlsx workbook.

    Returns:
        Finding dicts for cells matching INJECTION_PATTERNS and for
        text cells whose leading character marks a formula.
    """
    import openpyxl

    # data_only=False keeps formulas as source text instead of cached values,
    # so =CONCATENATE(...) payloads stay visible to the scanner.
    wb = openpyxl.load_workbook(file_path, data_only=False)
    findings = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        for row in ws.iter_rows():
            for cell in row:
                if cell.value is None:
                    continue
                value = str(cell.value)
                # Check for injection in cell content.
                for pattern in INJECTION_PATTERNS:
                    if pattern.lower() in value.lower():
                        findings.append({
                            "sheet": sheet_name,
                            "cell": cell.coordinate,
                            "value": value[:200],
                            "matched_pattern": pattern,
                            "risk": "high",
                        })
                # Check for formulas that could execute or hide content.
                # FIX: restrict to text cells — numeric cells stringify to
                # e.g. "-1.5" and were false-flagged by the "-" prefix.
                if isinstance(cell.value, str) and value.startswith(("=", "+", "-", "@")):
                    findings.append({
                        "sheet": sheet_name,
                        "cell": cell.coordinate,
                        "value": value[:200],
                        "technique": "formula_injection",
                        "risk": "medium",
                    })
    wb.close()  # FIX: the original never released the workbook
    return findings

When a spreadsheet containing =CONCATENATE("Ignore all previous ", "instructions") is extracted to plain text, the formula resolves to a prompt injection payload. Your text extractor evaluates the formula, and the resulting text enters your knowledge base clean.
Strip everything suspicious before extraction. Re-encode to remove hidden layers.
from dataclasses import dataclass
from typing import Optional
import tempfile
import subprocess
@dataclass
class SanitizationResult:
    """Outcome of sanitizing a single document before ingestion."""

    original_path: str          # path of the uploaded file as received
    sanitized_path: str         # path of the cleaned copy (same as original when blocked)
    findings: list[dict]        # scanner findings accumulated during sanitization
    blocked: bool               # True when the document must not be ingested
    block_reason: Optional[str] = None  # human-readable reason, set only when blocked
class DocumentSanitizer:
    """Sanitizes documents before RAG ingestion.

    Scans for hidden-content techniques, blocks on critical findings,
    and otherwise re-renders the PDF to flatten hidden layers and strips
    its metadata.
    """

    def __init__(self, max_file_size_mb: int = 50):
        # Size cap in bytes; enforcement presumably happens in the upload
        # handler — TODO confirm, no caller is visible here.
        self.max_file_size = max_file_size_mb * 1024 * 1024

    def sanitize_pdf(self, pdf_path: str) -> SanitizationResult:
        """Scan, then either block the document or return a flattened copy."""
        findings = []
        # Step 1: Scan for hidden content across all known techniques.
        findings.extend(scan_pdf_hidden_text(pdf_path))
        findings.extend(scan_pdf_metadata(pdf_path, INJECTION_PATTERNS))
        findings.extend(extract_and_scan_pdf_images(pdf_path))
        # Block outright if anything critical was found.
        critical = [f for f in findings if f.get("risk") == "critical"]
        if critical:
            return SanitizationResult(
                original_path=pdf_path,
                sanitized_path=pdf_path,
                findings=findings,
                blocked=True,
                block_reason=f"{len(critical)} critical injection patterns detected",
            )
        # Step 2: Re-render PDF to flatten hidden layers.
        sanitized_path = self._flatten_pdf(pdf_path)
        # Step 3: Strip metadata from the flattened copy.
        self._strip_metadata(sanitized_path)
        return SanitizationResult(
            original_path=pdf_path,
            sanitized_path=sanitized_path,
            findings=findings,
            blocked=False,
        )

    def _flatten_pdf(self, pdf_path: str) -> str:
        """Re-render each page as an image to drop hidden layers and
        off-page content; returns the path of the new PDF."""
        import os

        # FIX: tempfile.mktemp is deprecated and race-prone; mkstemp
        # atomically creates the file.
        fd, output_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        doc = fitz.open(pdf_path)
        new_doc = fitz.open()
        for page in doc:
            # Rasterize the page, then rebuild it as a flat image page.
            pix = page.get_pixmap(dpi=150)
            new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
            new_page.insert_image(new_page.rect, pixmap=pix)
        new_doc.save(output_path)
        new_doc.close()  # FIX: both documents were leaked in the original
        doc.close()
        return output_path

    def _strip_metadata(self, pdf_path: str):
        """Remove all metadata from the PDF in place (incremental save)."""
        doc = fitz.open(pdf_path)
        doc.set_metadata({})
        doc.saveIncr()

The most reliable defense: extract all text content first, run it through your text guardrails, then pass it to the LLM. This catches injection payloads regardless of which format they were hidden in.
class PreExtractionGuardrail:
"""Extract text from documents, scan it, then decide whether to ingest."""
def __init__(self, text_guardrail, sanitizer: DocumentSanitizer):
self.text_guardrail = text_guardrail
self.sanitizer = sanitizer
def process_document(self, file_path: str, file_type: str) -> dict:
# Step 1: Sanitize the document format
sanitization = self.sanitizer.sanitize_pdf(file_path)
if sanitization.blocked:
return {
"status": "blocked",
"reason": sanitization.block_reason,
"findings": sanitization.findings,
}
# Step 2: Extract all text content
extracted_text = self._extract_text(sanitization.sanitized_path, file_type)
# Step 3: Run text through guardrails (same ones used for user input)
guard_result = self.text_guardrail.check(extracted_text)
if not guard_result.passed:
return {
"status": "blocked",
"reason": f"Text guardrail triggered: {guard_result.triggered_rules}",
"findings": sanitization.findings,
}
# Step 4: Safe to ingest
return {
"status": "accepted",
"text": guard_result.output, # may be redacted/filtered
"findings": sanitization.findings,
}
def _extract_text(self, file_path: str, file_type: str) -> str:
if file_type == "pdf":
doc = fitz.open(file_path)
return "\n".join(page.get_text() for page in doc)
elif file_type in ("xlsx", "xls"):
return self._extract_spreadsheet_text(file_path)
elif file_type in ("docx",):
return self._extract_docx_text(file_path)
else:
            raise ValueError(f"Unsupported file type: {file_type}")

Not all documents deserve the same level of trust. Internal documentation from your CMS is different from a PDF a customer uploaded.
@dataclass
class TrustScore:
    """Trust assessment attached to a document source."""

    source: str
    score: float  # 0.0 = untrusted, 1.0 = fully trusted
    requires_review: bool
    ingestion_policy: str  # "auto", "review", "block"


class DocumentTrustScorer:
    """Assigns trust scores to documents based on their source."""

    # Baseline trust per source type. Treated as immutable — see score().
    TRUST_LEVELS = {
        "internal_cms": TrustScore("internal_cms", 0.9, False, "auto"),
        "verified_partner": TrustScore("verified_partner", 0.7, False, "auto"),
        "authenticated_user": TrustScore("authenticated_user", 0.4, True, "review"),
        "anonymous_upload": TrustScore("anonymous_upload", 0.1, True, "review"),
        "web_scrape": TrustScore("web_scrape", 0.2, True, "review"),
    }

    def score(self, source_type: str, metadata: dict) -> TrustScore:
        """Return the trust score for a source, downgraded when the
        document carried scanner findings.

        Unknown source types fall back to the anonymous_upload baseline.
        """
        base = self.TRUST_LEVELS.get(source_type, self.TRUST_LEVELS["anonymous_upload"])
        # FIX: the original mutated the shared TRUST_LEVELS entry in place
        # (base.score *= 0.5), permanently degrading the baseline table for
        # every later call. Build a fresh TrustScore instead.
        if metadata.get("has_findings", False):
            base = TrustScore(base.source, base.score * 0.5, True, "review")
        return base
        return base

Documents from untrusted sources get deeper sanitization, stricter guardrail thresholds, and mandatory human review before entering your knowledge base.
Document ingestion is the front door of your RAG system. Every vulnerability that survives the ingestion pipeline becomes a permanent resident of your knowledge base—retrieved again and again, potentially for months before anyone notices.