Building an AI Documentation Assistant with RAG
A complete guide to building a production-grade AI documentation assistant using Retrieval-Augmented Generation, covering chunking strategies, embedding models, vector stores, and answer synthesis.
Why Documentation Needs AI
Technical documentation is one of the most universally frustrating aspects of software development. Teams write docs that become stale, users cannot find what they need, and search returns pages of irrelevant results. An AI documentation assistant powered by RAG (Retrieval-Augmented Generation) solves these problems by understanding natural language questions and synthesizing answers from your actual documentation.
Unlike a pure LLM approach, RAG grounds the AI's responses in your specific documentation, dramatically reducing hallucination and ensuring answers are accurate and up to date.
RAG Architecture for Documentation
A documentation RAG system has four main stages:
[User Question] -> [Embed Query] -> [Vector Search] -> [Retrieve Chunks] -> [LLM Synthesis] -> [Answer]
                                            ^
                                            |
[Document Corpus] -> [Chunked & Embedded at Ingestion] -> [Vector Store]
Stage 1: Document Ingestion and Chunking
The quality of your RAG system depends heavily on how you chunk your documents. Poor chunking leads to irrelevant retrieval, which leads to poor answers.
from dataclasses import dataclass
from pathlib import Path
import re
@dataclass
class DocumentChunk:
    """A single retrievable unit of documentation plus its provenance."""
    content: str   # chunk text; chunk_markdown prefixes it with "# <section title>"
    metadata: dict # source file, section, page number, etc.
    chunk_id: str  # stable identifier, e.g. "<source>::<section>" (see chunk_markdown)
class DocumentChunker:
    """Chunk markdown documents along semantic (header/paragraph) boundaries.

    Sections that fit within ``max_chunk_size`` tokens become one chunk;
    oversized sections are split into overlapping, paragraph-aligned
    sub-chunks. Token counts are approximated by whitespace word count
    (see ``_count_tokens``).
    """

    # Matches H1/H2 markdown headers; group(2) captures the title text.
    _HEADER_PATTERN = re.compile(r'^(#{1,2})\s+(.+)$')

    def __init__(self, max_chunk_size: int = 1000, overlap: int = 200):
        """
        Args:
            max_chunk_size: maximum chunk size, in (approximate) tokens.
            overlap: number of tokens carried over between consecutive
                sub-chunks of an oversized section.
        """
        self.max_chunk_size = max_chunk_size  # in tokens
        self.overlap = overlap

    def chunk_markdown(self, content: str, source: str) -> list["DocumentChunk"]:
        """Chunk a markdown document by headers, preserving semantic boundaries.

        Args:
            content: raw markdown text.
            source: identifier of the originating file; recorded in chunk
                metadata and used to build stable chunk ids.

        Returns:
            DocumentChunk objects, one per small section and one per
            overlapping "part" of each oversized section.
        """
        chunks: list[DocumentChunk] = []
        for section in self._split_by_headers(content):
            section_title = section["title"]
            section_text = section["content"]
            if self._count_tokens(section_text) <= self.max_chunk_size:
                chunks.append(DocumentChunk(
                    content=f"# {section_title}\n\n{section_text}",
                    metadata={
                        "source": source,
                        "section": section_title,
                        "type": "documentation",
                    },
                    chunk_id=f"{source}::{section_title}",
                ))
            else:
                # Section too large -- split by paragraphs with overlap.
                sub_chunks = self._split_with_overlap(section_text, section_title)
                for i, sub_chunk in enumerate(sub_chunks):
                    chunks.append(DocumentChunk(
                        content=f"# {section_title} (part {i+1})\n\n{sub_chunk}",
                        metadata={
                            "source": source,
                            "section": section_title,
                            "part": i + 1,
                            "type": "documentation",
                        },
                        chunk_id=f"{source}::{section_title}::part{i+1}",
                    ))
        return chunks

    def _split_by_headers(self, content: str) -> list[dict]:
        """Split content into {"title", "content"} sections at H1/H2 headers.

        Text before the first header is collected under "Introduction".
        NOTE(review): a header immediately followed by another header (no
        body text) is dropped — preserved from the original behavior.
        """
        sections: list[dict] = []
        current_title = "Introduction"
        current_content: list[str] = []
        for line in content.split("\n"):
            match = self._HEADER_PATTERN.match(line)
            if match:
                if current_content:
                    sections.append({
                        "title": current_title,
                        "content": "\n".join(current_content).strip(),
                    })
                current_title = match.group(2)
                current_content = []
            else:
                current_content.append(line)
        if current_content:
            sections.append({
                "title": current_title,
                "content": "\n".join(current_content).strip(),
            })
        return sections

    def _count_tokens(self, text: str) -> int:
        """Approximate a token count as whitespace-separated word count.

        Added: chunk_markdown calls this but the original listing never
        defined it. A word count is only a proxy — swap in the embedding
        model's tokenizer for exact budgeting.
        """
        return len(text.split())

    def _split_with_overlap(self, text: str, title: str) -> list[str]:
        """Split an oversized section into paragraph-aligned sub-chunks.

        Added: chunk_markdown calls this but the original listing never
        defined it. Paragraphs accumulate until the next one would exceed
        ``max_chunk_size``; the final ``overlap`` tokens of each emitted
        sub-chunk seed the next, so content near a boundary is retrievable
        from either side. A single paragraph larger than the budget is
        emitted as-is (no mid-paragraph split). ``title`` is accepted for
        parity with the call site but unused here.
        """
        paragraphs = [p for p in text.split("\n\n") if p.strip()]
        sub_chunks: list[str] = []
        current: list[str] = []
        current_tokens = 0
        for para in paragraphs:
            para_tokens = self._count_tokens(para)
            if current and current_tokens + para_tokens > self.max_chunk_size:
                emitted = "\n\n".join(current)
                sub_chunks.append(emitted)
                # Seed the next sub-chunk with the trailing overlap tokens.
                tail = emitted.split()[-self.overlap:] if self.overlap > 0 else []
                current = [" ".join(tail)] if tail else []
                current_tokens = len(tail)
            current.append(para)
            current_tokens += para_tokens
        if current:
            sub_chunks.append("\n\n".join(current))
        return sub_chunks
Chunking Strategy Comparison
| Strategy | Pros | Cons | Best For |
|---|---|---|---|
| Fixed-size | Simple, predictable | Breaks mid-sentence | Uniform content |
| Header-based | Preserves semantic units | Sections vary wildly in size | Markdown/HTML docs |
| Paragraph-based | Natural boundaries | Paragraphs can be too small | Prose-heavy docs |
| Recursive | Adapts to content structure | More complex to implement | Mixed content types |
| Semantic | Best retrieval quality | Requires embedding model | High-value corpora |
Stage 2: Embedding and Indexing
Generate embeddings for each chunk and store them in a vector database:
import voyageai
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
# Initialize clients.
voyage = voyageai.Client()  # NOTE(review): presumably picks up VOYAGE_API_KEY from env — confirm
qdrant = QdrantClient(url="http://localhost:6333")  # local single-node instance

COLLECTION_NAME = "documentation"  # Qdrant collection holding all doc chunks
EMBEDDING_MODEL = "voyage-3"       # must be identical at index AND query time
EMBEDDING_DIM = 1024               # vector size for the collection — assumes voyage-3 output dim; TODO confirm
async def create_collection():
    """Create the vector collection with appropriate settings.

    NOTE(review): declared async but contains no await — the qdrant client
    call here is synchronous and will block the event loop while it runs.
    """
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=EMBEDDING_DIM,        # must match the embedding model's output dimension
            distance=Distance.COSINE,  # cosine similarity used for ranking
        ),
    )
async def index_chunks(chunks: list["DocumentChunk"]) -> None:
    """Embed and index document chunks.

    Point ids are derived deterministically from each chunk's ``chunk_id``
    (UUIDv5) instead of the batch position. Fix: the original used
    ``id=i``, which restarts at 0 on every call, so successive indexing
    runs (e.g. per-file incremental re-indexing) silently overwrote
    unrelated points. Stable ids make re-indexing the same chunk an
    in-place upsert while distinct chunks never collide.
    """
    import uuid  # local import: only needed for id derivation

    if not chunks:
        return  # nothing to embed; skip the empty API round-trip

    # Batch embedding for efficiency.
    texts = [chunk.content for chunk in chunks]
    embeddings = voyage.embed(
        texts, model=EMBEDDING_MODEL, input_type="document"
    ).embeddings

    points = [
        PointStruct(
            # Deterministic UUID from the chunk's stable identifier.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, chunk.chunk_id)),
            vector=embedding,
            payload={
                "content": chunk.content,
                "source": chunk.metadata["source"],
                "section": chunk.metadata.get("section", ""),
                "chunk_id": chunk.chunk_id,
            },
        )
        for chunk, embedding in zip(chunks, embeddings)
    ]
    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
Stage 3: Retrieval
When a user asks a question, embed their query and search for the most relevant chunks:
async def retrieve_context(query: str, top_k: int = 5) -> list[dict]:
    """Fetch the documentation chunks most relevant to *query*.

    The query is embedded with input_type="query" (asymmetric to the
    "document" embeddings produced at indexing time) and matched against
    the vector store; hits scoring below 0.7 are discarded.
    """
    # Embed the incoming question.
    embedded = voyage.embed([query], model=EMBEDDING_MODEL, input_type="query")
    query_vector = embedded.embeddings[0]

    # Nearest-neighbour search over the indexed chunks.
    hits = qdrant.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=top_k,
        score_threshold=0.7,  # Filter out low-relevance results
    )

    # Project each hit down to the fields the synthesis stage needs.
    context = []
    for hit in hits:
        payload = hit.payload
        context.append({
            "content": payload["content"],
            "source": payload["source"],
            "section": payload["section"],
            "score": hit.score,
        })
    return context
Stage 4: Answer Synthesis
Combine the retrieved context with the user's question and generate an answer:
import anthropic
client = anthropic.Anthropic()  # NOTE(review): presumably reads ANTHROPIC_API_KEY from env — confirm

# Grounding rules for answer synthesis: constrain the model to the retrieved
# context and force an explicit "not found" answer instead of a guess.
SYSTEM_PROMPT = """You are a documentation assistant. Answer questions based ONLY on the provided documentation context. Follow these rules:
1. If the answer is in the documentation, provide it with the source reference
2. If the answer is NOT in the documentation, say "I could not find this in the documentation"
3. Never make up information that is not in the provided context
4. Include code examples from the documentation when relevant
5. Cite the source document for each piece of information"""
async def answer_question(query: str) -> dict:
    """Answer a documentation question using RAG.

    Retrieves supporting chunks, then asks the model to synthesize an
    answer grounded in them. Returns a dict with "answer" and "sources";
    when a model call was made, token "usage" is included as well.
    """
    # Retrieve supporting context first; without it there is nothing to ground on.
    context_chunks = await retrieve_context(query, top_k=5)

    if not context_chunks:
        return {
            "answer": "I could not find relevant documentation for your question.",
            "sources": [],
        }

    # Render each chunk with its provenance so the model can cite sources.
    rendered = [
        f"Source: {c['source']} > {c['section']}\n{c['content']}"
        for c in context_chunks
    ]
    context = "\n\n---\n\n".join(rendered)

    # Generate an answer grounded in the retrieved context.
    prompt = f"Documentation context:\n{context}\n\nQuestion: {query}"
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2048,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": prompt}],
    )

    sources = [
        {"file": c["source"], "section": c["section"], "relevance": c["score"]}
        for c in context_chunks
    ]
    return {
        "answer": response.content[0].text,
        "sources": sources,
        "usage": {
            "input_tokens": response.usage.input_tokens,
            "output_tokens": response.usage.output_tokens,
        },
    }
Advanced RAG Techniques
Hybrid Search
Combine vector search with keyword search (BM25) for better retrieval:
async def hybrid_retrieve(query: str, top_k: int = 5, alpha: float = 0.7) -> list[dict]:
    """Blend vector and keyword retrieval via weighted reciprocal-rank fusion.

    Each ranked list contributes weight / (60 + rank) to a chunk's fused
    score — *alpha* for the vector side, (1 - alpha) for BM25. 60 is the
    conventional RRF damping constant. The fused top_k chunk ids are then
    resolved back to full chunks.
    """
    # Over-fetch from both retrievers so fusion has candidates to merge.
    vector_results = await vector_search(query, top_k=top_k * 2)
    keyword_results = await bm25_search(query, top_k=top_k * 2)

    fused: dict = {}

    def accumulate(ranked, weight):
        # Add one ranked list's Reciprocal Rank Fusion contributions.
        for rank, hit in enumerate(ranked):
            cid = hit["chunk_id"]
            fused[cid] = fused.get(cid, 0) + weight / (60 + rank)

    accumulate(vector_results, alpha)
    accumulate(keyword_results, 1 - alpha)

    # Highest combined score first, truncated to the requested count.
    ranking = sorted(fused, key=fused.get, reverse=True)
    return [get_chunk(cid) for cid in ranking[:top_k]]
Query Rewriting
Improve retrieval by rewriting the user's question before searching:
async def rewrite_query(original_query: str) -> list[str]:
    """Generate alternative search queries for the original question.

    Returns the original query first, followed by up to three rewrites, so
    the caller can search with every variant. Blank lines in the model
    output are discarded (robustness fix — the original kept them).

    NOTE(review): declared async, but ``client.messages.create`` is the
    blocking SDK call — consider anthropic.AsyncAnthropic for real
    concurrency.
    """
    response = client.messages.create(
        # Fixed: Anthropic's model id is "claude-3-5-haiku-<date>";
        # the original "claude-haiku-3-5-20241022" is not a valid id.
        model="claude-3-5-haiku-20241022",
        max_tokens=256,
        messages=[{
            "role": "user",
            "content": f"""Generate 3 alternative search queries for this documentation question.
Return one query per line, no numbering.
Original question: {original_query}"""
        }]
    )
    # Keep only non-empty lines; the model may emit spacing between queries.
    lines = [q.strip() for q in response.content[0].text.strip().split("\n")]
    queries = [q for q in lines if q]
    return [original_query] + queries[:3]
Re-ranking
After initial retrieval, use a cross-encoder model to re-rank results for higher precision:
async def rerank_results(query: str, results: list[dict], top_k: int = 5) -> list[dict]:
    """Re-order retrieved chunks with a cross-encoder re-ranker.

    Sends the chunk texts to Voyage's rerank-2 model and returns the
    original result dicts in the re-ranked order, truncated to *top_k*.
    """
    documents = [candidate["content"] for candidate in results]
    reranked = voyage.rerank(
        query=query,
        documents=documents,
        model="rerank-2",
        top_k=top_k,
    )
    # Each reranked item's .index points back into the input list, so map
    # through `results` to preserve the full payloads.
    ordered = []
    for item in reranked.results:
        ordered.append(results[item.index])
    return ordered
Keeping Documentation Fresh
A documentation assistant is only as good as its index. Implement automated re-indexing:
async def incremental_reindex(changed_files: list[str]) -> None:
    """Re-index only the documentation files that changed.

    For each file: delete its existing points (matched on the "source"
    payload field), then re-chunk and re-embed the current contents.

    Fixes vs. the original sketch: the chunker is constructed explicitly
    (the original referenced a module-level ``chunker`` that was never
    defined), and the file is read as UTF-8 instead of the platform
    default encoding.
    """
    chunker = DocumentChunker()
    for file_path in changed_files:
        # Remove stale chunks for this file before re-inserting.
        qdrant.delete(
            collection_name=COLLECTION_NAME,
            points_selector={"filter": {"must": [
                {"key": "source", "match": {"value": file_path}}
            ]}}
        )
        # Chunk and re-index the current file contents.
        content = Path(file_path).read_text(encoding="utf-8")
        chunks = chunker.chunk_markdown(content, source=file_path)
        await index_chunks(chunks)
Conclusion
Building an AI documentation assistant with RAG transforms static documentation into an interactive knowledge base. The key decisions -- chunking strategy, embedding model, retrieval method, and synthesis prompt -- each have a measurable impact on answer quality. Start with header-based chunking and simple vector search, measure answer quality with evals, and iterate toward hybrid search and re-ranking as your corpus and user base grow.
NYC News
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.