
Semantic Search for Code: Finding Functions, Classes, and Documentation

Build a semantic code search engine that finds relevant functions and classes by intent rather than identifier names, using code-specific embeddings from CodeBERT and AST-aware parsing to understand code structure.

Why Code Search Needs Semantics

Standard text search tools like grep or IDE find-in-files match literal strings. When you search for "validate email address," grep will only find functions that contain those exact words. But your codebase might have a function called check_email_format or is_valid_email that does exactly what you need. Semantic code search bridges this gap by understanding the intent behind code, matching natural language queries to code by meaning.

Extracting Code Units with AST Parsing

Before embedding code, we need to extract meaningful units — functions, classes, and their docstrings — using Abstract Syntax Tree (AST) parsing.

import ast
from dataclasses import dataclass
from typing import List, Optional
from pathlib import Path

@dataclass
class CodeUnit:
    name: str
    type: str  # "function", "class", "method"
    docstring: Optional[str]
    signature: str
    body: str
    file_path: str
    line_number: int

    @property
    def search_text(self) -> str:
        """Combine all textual signals for embedding."""
        parts = [self.name.replace("_", " ")]
        if self.docstring:
            parts.append(self.docstring)
        parts.append(self.signature)
        return " . ".join(parts)


class PythonCodeParser:
    def parse_file(self, file_path: str) -> List[CodeUnit]:
        """Extract functions and classes from a Python file."""
        source = Path(file_path).read_text()
        tree = ast.parse(source, filename=file_path)
        units = []

        # First pass: classes and their methods. Track method nodes so
        # the second pass does not emit them again as standalone
        # functions (ast.walk visits nested defs too).
        method_nodes = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                units.append(self._extract_class(node, file_path))
                for item in node.body:
                    if isinstance(item, ast.FunctionDef):
                        method_nodes.add(item)
                        method = self._extract_function(item, source, file_path)
                        method.type = "method"
                        method.name = f"{node.name}.{item.name}"
                        units.append(method)

        # Second pass: module-level and nested functions.
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) and node not in method_nodes:
                units.append(self._extract_function(node, source, file_path))

        return units

    def _extract_function(
        self, node: ast.FunctionDef, source: str, file_path: str
    ) -> CodeUnit:
        args = [arg.arg for arg in node.args.args if arg.arg != "self"]
        signature = f"def {node.name}({', '.join(args)})"
        # Reuse the already-read source rather than re-reading the file
        # for every function.
        body = ast.get_source_segment(source, node) or ""

        return CodeUnit(
            name=node.name,
            type="function",
            docstring=ast.get_docstring(node),
            signature=signature,
            body=body[:500],
            file_path=file_path,
            line_number=node.lineno,
        )

    def _extract_class(
        self, node: ast.ClassDef, file_path: str
    ) -> CodeUnit:
        # ast.unparse (Python 3.9+) renders any base-class expression,
        # including dotted names like "abc.ABC".
        bases = [ast.unparse(b) for b in node.bases]
        signature = (
            f"class {node.name}({', '.join(bases)})"
            if bases
            else f"class {node.name}"
        )

        return CodeUnit(
            name=node.name,
            type="class",
            docstring=ast.get_docstring(node),
            signature=signature,
            body="",
            file_path=file_path,
            line_number=node.lineno,
        )

    def parse_directory(self, directory: str) -> List[CodeUnit]:
        """Recursively parse all Python files in a directory."""
        units = []
        for py_file in Path(directory).rglob("*.py"):
            try:
                units.extend(self.parse_file(str(py_file)))
            except (SyntaxError, UnicodeDecodeError):
                # Skip files that fail to parse or decode.
                continue
        return units
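To see what a code unit's search text looks like in practice, here is a minimal standalone sketch that mirrors the same name + docstring + signature construction on an in-memory snippet. The sample function `check_email_format` is a made-up example, not from a real codebase:

```python
import ast
import textwrap

# A small sample module; check_email_format is hypothetical.
sample = textwrap.dedent('''
    def check_email_format(address):
        """Return True if the address looks like a valid email."""
        return "@" in address
''')

func = ast.parse(sample).body[0]

# Mirror CodeUnit.search_text: split the identifier into words,
# then append the docstring and a reconstructed signature.
name_text = func.name.replace("_", " ")
docstring = ast.get_docstring(func) or ""
args = ", ".join(a.arg for a in func.args.args)
signature = f"def {func.name}({args})"

search_text = " . ".join([name_text, docstring, signature])
print(search_text)
# check email format . Return True if the address looks like a
# valid email. . def check_email_format(address)
```

Note how the query "validate email address" now has three textual signals to match against, even though the identifier never contains the word "validate".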

Code-Specific Embedding Models

General-purpose text models work reasonably well for code search, but code-specific models such as CodeBERT or UniXcoder understand programming concepts better.

from sentence_transformers import SentenceTransformer
import numpy as np

class CodeSearchEngine:
    def __init__(self):
        # UniXcoder handles both natural language and code. Note it is
        # not a native sentence-transformers checkpoint, so the library
        # wraps it with a default mean-pooling layer on load.
        self.model = SentenceTransformer("microsoft/unixcoder-base")
        self.parser = PythonCodeParser()
        self.code_units: List[CodeUnit] = []
        self.embeddings: Optional[np.ndarray] = None

    def index_directory(self, directory: str):
        """Parse and embed all code in a directory."""
        self.code_units = self.parser.parse_directory(directory)

        search_texts = [unit.search_text for unit in self.code_units]
        self.embeddings = self.model.encode(
            search_texts,
            normalize_embeddings=True,
            batch_size=32,
            show_progress_bar=True,
        )
        print(f"Indexed {len(self.code_units)} code units")

    def search(
        self, query: str, top_k: int = 10, type_filter: Optional[str] = None
    ) -> List[dict]:
        """Search code using natural language query."""
        query_emb = self.model.encode(
            [query], normalize_embeddings=True
        )
        scores = np.dot(self.embeddings, query_emb.T).flatten()
        top_indices = np.argsort(scores)[::-1]

        results = []
        for idx in top_indices:
            if len(results) >= top_k:
                break
            unit = self.code_units[idx]
            if type_filter and unit.type != type_filter:
                continue
            results.append({
                "name": unit.name,
                "type": unit.type,
                "signature": unit.signature,
                "docstring": unit.docstring or "No docstring",
                "file": unit.file_path,
                "line": unit.line_number,
                "score": float(scores[idx]),
            })
        return results
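The ranking step relies on a property worth making explicit: once embeddings are L2-normalized, a plain dot product equals cosine similarity, so a single matrix-vector product scores every indexed unit at once. A toy sketch with hand-made unit vectors standing in for real embeddings:

```python
import numpy as np

# Stand-ins for normalized embeddings: 3 code units, 4 dimensions,
# each row already has unit length.
embeddings = np.array([
    [0.5, 0.5, 0.5, 0.5],
    [1.0, 0.0, 0.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
])
query = np.array([[1.0, 0.0, 0.0, 0.0]])

# With unit-length rows, the dot product is the cosine similarity.
scores = np.dot(embeddings, query.T).flatten()
ranked = np.argsort(scores)[::-1]
print(ranked)  # unit 1 is the closest match
```

This is why `normalize_embeddings=True` matters at both index time and query time: if either side is unnormalized, the dot product no longer ranks by angle and longer vectors get an unearned boost.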

Combining Docstring and Code Body Embeddings

For higher quality results, embed the docstring and the code body separately, then combine their similarity scores.


class DualEmbeddingCodeSearch:
    def __init__(self):
        self.nl_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.code_model = SentenceTransformer("microsoft/unixcoder-base")
        self.code_units: List[CodeUnit] = []
        self.doc_embeddings: Optional[np.ndarray] = None
        self.code_embeddings: Optional[np.ndarray] = None

    def index(self, code_units: List[CodeUnit]):
        self.code_units = code_units

        doc_texts = [
            unit.docstring or unit.name.replace("_", " ")
            for unit in code_units
        ]
        self.doc_embeddings = self.nl_model.encode(
            doc_texts, normalize_embeddings=True
        )

        code_texts = [unit.body[:300] or unit.signature for unit in code_units]
        self.code_embeddings = self.code_model.encode(
            code_texts, normalize_embeddings=True
        )

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_weight: float = 0.6,
        code_weight: float = 0.4,
    ) -> List[dict]:
        """Hybrid search using both docstring and code embeddings."""
        nl_query = self.nl_model.encode(
            [query], normalize_embeddings=True
        )
        code_query = self.code_model.encode(
            [query], normalize_embeddings=True
        )

        doc_scores = np.dot(self.doc_embeddings, nl_query.T).flatten()
        code_scores = np.dot(self.code_embeddings, code_query.T).flatten()

        combined = doc_weight * doc_scores + code_weight * code_scores
        top_indices = np.argsort(combined)[::-1][:top_k]

        return [
            {
                "name": self.code_units[i].name,
                "score": float(combined[i]),
                "doc_score": float(doc_scores[i]),
                "code_score": float(code_scores[i]),
                "file": self.code_units[i].file_path,
                "line": self.code_units[i].line_number,
            }
            for i in top_indices
        ]
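The weighted combination can produce a ranking that neither channel gives alone. A small numeric sketch with made-up scores, using the same 0.6/0.4 defaults:

```python
import numpy as np

# Hypothetical per-unit similarity scores for three code units.
doc_scores = np.array([0.9, 0.4, 0.1])   # docstring channel
code_scores = np.array([0.2, 0.8, 0.1])  # code-body channel

combined = 0.6 * doc_scores + 0.4 * code_scores
# unit 0: 0.54 + 0.08 = 0.62
# unit 1: 0.24 + 0.32 = 0.56
# unit 2: 0.06 + 0.04 = 0.10
ranked = np.argsort(combined)[::-1]
print(ranked)  # the doc-heavy unit 0 edges out the code-heavy unit 1
```

Tuning the weights shifts this trade-off: raise `code_weight` when queries mention implementation details ("binary search over sorted offsets"), raise `doc_weight` when they describe intent ("retry failed uploads").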

FAQ

Which embedding model should I use for code search?

UniXcoder generally provides the best results for code search because it was pre-trained on both natural language and six programming languages with a unified cross-modal architecture. CodeBERT is a strong alternative. General-purpose models like all-MiniLM-L6-v2 work surprisingly well for docstring matching but struggle with raw code bodies. If your queries are natural language descriptions, a general model with docstring embeddings is often sufficient.

How do I handle code that has no docstrings?

For undocumented code, construct a synthetic description from the function name (split on underscores and camelCase), parameter names, and return type annotations. For example, def calculate_monthly_payment(principal, rate, term) yields "calculate monthly payment with parameters principal, rate, term." This synthetic description is usually enough for basic semantic matching.
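A minimal sketch of that synthetic-description idea. The helper name and the regex-based camelCase splitting are illustrative choices, not part of the pipeline above:

```python
import ast
import re

def synthetic_description(source: str) -> str:
    """Build a search description for an undocumented function."""
    func = ast.parse(source).body[0]
    # Split camelCase at lowercase/digit-to-uppercase boundaries,
    # then split snake_case on underscores.
    words = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", func.name)
    words = words.replace("_", " ").lower()
    params = ", ".join(a.arg for a in func.args.args if a.arg != "self")
    return f"{words} with parameters {params}" if params else words

print(synthetic_description(
    "def calculate_monthly_payment(principal, rate, term): ..."
))
# -> "calculate monthly payment with parameters principal, rate, term"
```

The resulting string plugs straight into the `search_text` slot that a docstring would normally fill.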

Can this approach work for languages other than Python?

Yes. The AST parsing layer needs to be language-specific; tree-sitter offers a universal parsing framework with grammars for dozens of languages. The embedding and search layers remain identical. Node type names vary by grammar (for example, function_definition in Python versus function_declaration in Go), but tree-sitter's query interface is uniform, so you can extract functions, classes, and doc comments from JavaScript, Go, Rust, or Java with the same pipeline structure.

