Semantic Search for Code: Finding Functions, Classes, and Documentation
Build a semantic code search engine that finds relevant functions and classes by intent rather than identifier names, using code-specific embeddings from CodeBERT and AST-aware parsing to understand code structure.
Why Code Search Needs Semantics
Standard text search tools like grep or IDE find-in-files match literal strings. When you search for "validate email address," grep will only find functions that contain those exact words. But your codebase might have a function called check_email_format or is_valid_email that does exactly what you need. Semantic code search bridges this gap by understanding the intent behind code, matching natural language queries to code by meaning.
Extracting Code Units with AST Parsing
Before embedding code, we need to extract meaningful units — functions, classes, and their docstrings — using Abstract Syntax Tree (AST) parsing.
import ast
from dataclasses import dataclass
from typing import List, Optional
from pathlib import Path
@dataclass
class CodeUnit:
    """One searchable unit of code extracted from a source file."""

    name: str
    type: str  # "function", "class", or "method"
    docstring: Optional[str]
    signature: str
    body: str
    file_path: str
    line_number: int

    @property
    def search_text(self) -> str:
        """Combine all textual signals for embedding.

        Joins the identifier (underscores expanded to spaces), the
        docstring when present, and the signature with " . " separators.
        """
        signals = [self.name.replace("_", " ")]
        if self.docstring:
            signals.append(self.docstring)
        signals.append(self.signature)
        return " . ".join(signals)
class PythonCodeParser:
    """Extract functions, classes, and methods from Python files via the ast module."""

    def parse_file(self, file_path: str) -> List[CodeUnit]:
        """Extract functions and classes from a Python file.

        Functions defined directly in a class body are typed "method" and
        renamed "ClassName.method"; all other (async) functions are typed
        "function". Raises SyntaxError for invalid Python source.
        """
        source = Path(file_path).read_text()
        tree = ast.parse(source, filename=file_path)

        # Record class-body functions up front so the ast.walk pass below
        # does not ALSO emit each of them as a stand-alone "function".
        # (Without this, every method was indexed twice.)
        method_ids = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                for item in node.body:
                    if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        method_ids.add(id(item))

        units: List[CodeUnit] = []
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                if id(node) not in method_ids:
                    units.append(self._extract_function(node, file_path, source))
            elif isinstance(node, ast.ClassDef):
                units.append(self._extract_class(node, file_path))
                for item in node.body:
                    if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        method = self._extract_function(item, file_path, source)
                        method.type = "method"
                        method.name = f"{node.name}.{item.name}"
                        units.append(method)
        return units

    def _extract_function(
        self,
        node,
        file_path: str,
        source: Optional[str] = None,
    ) -> CodeUnit:
        """Build a CodeUnit for a (sync or async) function node.

        `source` is the file's text; passing it avoids re-reading the file
        for every function (the old code did one read per function).
        """
        if source is None:
            source = Path(file_path).read_text()
        # Drop `self` so method signatures read like call sites.
        args = [arg.arg for arg in node.args.args if arg.arg != "self"]
        prefix = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
        signature = f"{prefix} {node.name}({', '.join(args)})"
        body = ast.get_source_segment(source, node) or ""
        return CodeUnit(
            name=node.name,
            type="function",
            docstring=ast.get_docstring(node),
            body=body[:500],  # cap body length to keep embeddings focused
            signature=signature,
            file_path=file_path,
            line_number=node.lineno,
        )

    def _extract_class(
        self, node: ast.ClassDef, file_path: str
    ) -> CodeUnit:
        """Build a CodeUnit for a class definition (no body text, header only)."""
        # Only simple-name bases are rendered; attribute/subscript bases
        # (e.g. `abc.ABC`, `Generic[T]`) are shown as "...".
        bases = [
            b.id if isinstance(b, ast.Name) else "..." for b in node.bases
        ]
        signature = (
            f"class {node.name}({', '.join(bases)})"
            if bases
            else f"class {node.name}"
        )
        return CodeUnit(
            name=node.name,
            type="class",
            docstring=ast.get_docstring(node),
            signature=signature,
            body="",
            file_path=file_path,
            line_number=node.lineno,
        )

    def parse_directory(self, directory: str) -> List[CodeUnit]:
        """Recursively parse all Python files in a directory.

        Files that are not valid (or not decodable) Python are skipped
        rather than aborting the whole index.
        """
        units: List[CodeUnit] = []
        for py_file in Path(directory).rglob("*.py"):
            try:
                units.extend(self.parse_file(str(py_file)))
            except (SyntaxError, UnicodeDecodeError):
                continue
        return units
Code-Specific Embedding Models
General-purpose text models work reasonably well for code search, but code-specific models like CodeBERT or UniXcoder understand programming concepts better.
from sentence_transformers import SentenceTransformer
import numpy as np
class CodeSearchEngine:
    """Semantic code search: index a directory of Python files, then query in natural language."""

    def __init__(self):
        # UniXcoder handles both natural language and code well.
        self.model = SentenceTransformer(
            "microsoft/unixcoder-base"
        )
        self.parser = PythonCodeParser()
        self.code_units: List[CodeUnit] = []
        self.embeddings: Optional[np.ndarray] = None

    def index_directory(self, directory: str):
        """Parse and embed all code in a directory."""
        self.code_units = self.parser.parse_directory(directory)
        search_texts = [unit.search_text for unit in self.code_units]
        self.embeddings = self.model.encode(
            search_texts,
            normalize_embeddings=True,
            batch_size=32,
            show_progress_bar=True,
        )
        print(f"Indexed {len(self.code_units)} code units")

    def search(
        self, query: str, top_k: int = 10, type_filter: Optional[str] = None
    ) -> List[dict]:
        """Search code using a natural language query.

        Args:
            query: Natural-language description of the code you want.
            top_k: Maximum number of results to return.
            type_filter: Optionally restrict results to one unit type
                ("function", "class", or "method").

        Raises:
            RuntimeError: if index_directory() has not been called yet.
        """
        if self.embeddings is None:
            # Fail loudly instead of crashing inside np.dot with None.
            raise RuntimeError("Call index_directory() before search().")
        query_emb = self.model.encode(
            [query], normalize_embeddings=True
        )
        # Embeddings are L2-normalized, so the dot product is cosine similarity.
        scores = np.dot(self.embeddings, query_emb.T).flatten()
        top_indices = np.argsort(scores)[::-1]
        results = []
        # Filter AFTER ranking so type_filter never starves the result list
        # of candidates below the raw top_k cutoff.
        for idx in top_indices:
            if len(results) >= top_k:
                break
            unit = self.code_units[idx]
            if type_filter and unit.type != type_filter:
                continue
            results.append({
                "name": unit.name,
                "type": unit.type,
                "signature": unit.signature,
                "docstring": unit.docstring or "No docstring",
                "file": unit.file_path,
                "line": unit.line_number,
                "score": float(scores[idx]),
            })
        return results
Combining Docstring and Code Body Embeddings
For higher quality results, embed the docstring and the code body separately, then combine their similarity scores.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
class DualEmbeddingCodeSearch:
    """Hybrid search that embeds docstrings and code bodies with separate models."""

    def __init__(self):
        # Natural-language model for docstrings; code-aware model for bodies.
        self.nl_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.code_model = SentenceTransformer("microsoft/unixcoder-base")
        self.code_units: List[CodeUnit] = []
        self.doc_embeddings: Optional[np.ndarray] = None
        self.code_embeddings: Optional[np.ndarray] = None

    def index(self, code_units: List[CodeUnit]):
        """Embed documentation text and code bodies for the given units."""
        self.code_units = code_units
        # Undocumented units fall back to a synthetic description built
        # from the identifier (underscores expanded to spaces).
        doc_texts = [
            unit.docstring or unit.name.replace("_", " ")
            for unit in code_units
        ]
        self.doc_embeddings = self.nl_model.encode(
            doc_texts, normalize_embeddings=True
        )
        # Units with no body (e.g. classes) are represented by their signature.
        code_texts = [unit.body[:300] or unit.signature for unit in code_units]
        self.code_embeddings = self.code_model.encode(
            code_texts, normalize_embeddings=True
        )

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_weight: float = 0.6,
        code_weight: float = 0.4,
    ) -> List[dict]:
        """Hybrid search using both docstring and code embeddings.

        Scores each unit as doc_weight * docstring similarity plus
        code_weight * code-body similarity and returns the top_k units.

        Raises:
            RuntimeError: if index() has not been called yet.
        """
        if self.doc_embeddings is None or self.code_embeddings is None:
            # Fail loudly instead of crashing inside np.dot with None.
            raise RuntimeError("Call index() before search().")
        nl_query = self.nl_model.encode(
            [query], normalize_embeddings=True
        )
        code_query = self.code_model.encode(
            [query], normalize_embeddings=True
        )
        # Normalized embeddings: dot products are cosine similarities.
        doc_scores = np.dot(self.doc_embeddings, nl_query.T).flatten()
        code_scores = np.dot(self.code_embeddings, code_query.T).flatten()
        combined = doc_weight * doc_scores + code_weight * code_scores
        top_indices = np.argsort(combined)[::-1][:top_k]
        return [
            {
                "name": self.code_units[i].name,
                "score": float(combined[i]),
                "doc_score": float(doc_scores[i]),
                "code_score": float(code_scores[i]),
                "file": self.code_units[i].file_path,
                "line": self.code_units[i].line_number,
            }
            for i in top_indices
        ]
FAQ
Should I use CodeBERT, UniXcoder, or a general-purpose model for code search?
UniXcoder generally provides the best results for code search because it was pre-trained on both natural language and six programming languages with a unified cross-modal architecture. CodeBERT is a strong alternative. General-purpose models like all-MiniLM-L6-v2 work surprisingly well for docstring matching but struggle with raw code bodies. If your queries are natural language descriptions, a general model with docstring embeddings is often sufficient.
How do I handle code that has no docstrings?
For undocumented code, construct a synthetic description from the function name (split on underscores and camelCase), parameter names, and return type annotations. For example, def calculate_monthly_payment(principal, rate, term) yields "calculate monthly payment with parameters principal, rate, term." This synthetic description is usually enough for basic semantic matching.
Can this approach work for languages other than Python?
Yes. The AST parsing layer needs to be language-specific — use tree-sitter for a universal parser that supports 40+ languages. The embedding and search layers remain identical. Tree-sitter provides consistent node types across languages, so you can extract functions, classes, and docstrings from JavaScript, Go, Rust, or Java with the same pipeline structure.
#CodeSearch #CodeBERT #ASTParsing #SemanticSearch #DeveloperTools #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.