Building a Cost Optimization Agent for Cloud Infrastructure: AWS, GCP, and Azure
Build an AI agent that analyzes cloud resource usage across AWS, GCP, and Azure, identifies waste, recommends rightsizing, and suggests reserved instance purchases for maximum savings.
The Cloud Cost Problem
Most organizations overspend on cloud infrastructure by 30-40%. The waste comes from oversized instances, unused resources, missing reserved instance coverage, and zombie infrastructure that nobody remembers deploying. An AI cost optimization agent continuously scans your cloud accounts, identifies waste, and generates actionable recommendations with projected savings.
Multi-Cloud Resource Discovery
The agent needs a unified view of resources across all cloud providers. A provider abstraction layer handles the differences.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class CloudResource:
    """Normalized view of a billable resource from any cloud provider.

    Utilization fields (avg_cpu_percent, avg_memory_percent) and
    monthly_cost default to 0.0 and are filled in by the provider layer.
    """

    provider: str  # "aws", "gcp", "azure"
    resource_type: str  # "compute", "database", "storage", "network"
    resource_id: str
    name: str
    region: str
    instance_type: Optional[str] = None
    monthly_cost: float = 0.0
    avg_cpu_percent: float = 0.0
    avg_memory_percent: float = 0.0
    last_accessed: Optional[str] = None  # timestamp string; None when unknown
    # default_factory gives each instance its own dict — the idiomatic fix
    # for the mutable-default pitfall the original worked around with
    # `tags: dict = None` plus a __post_init__ conversion.
    tags: dict = field(default_factory=dict)

    def __post_init__(self):
        # Preserve the original contract: an explicit tags=None still
        # normalizes to an empty dict.
        if self.tags is None:
            self.tags = {}
class CloudProvider(ABC):
    """Abstract interface that gives the agent a unified view of resources
    across cloud vendors; concrete providers (e.g. AWSProvider) implement
    discovery and metrics retrieval for one vendor."""

    @abstractmethod
    async def list_compute_resources(self) -> list[CloudResource]:
        """Return compute resources visible to this account as
        CloudResource records."""
        pass

    @abstractmethod
    async def get_utilization(self, resource_id: str, days: int) -> dict:
        """Return utilization stats for one resource over the trailing
        *days* window (AWS implementation returns keys: "avg_cpu",
        "max_cpu", "data_points")."""
        pass
class AWSProvider(CloudProvider):
    """CloudProvider implementation backed by boto3 (EC2, CloudWatch,
    Cost Explorer). Note: boto3 calls are blocking; these async methods do
    not overlap I/O — consider a thread executor if that matters."""

    def __init__(self):
        # Imported lazily so the module can be loaded without boto3 present.
        import boto3
        self.ec2 = boto3.client("ec2")
        self.cloudwatch = boto3.client("cloudwatch")
        self.ce = boto3.client("ce")  # Cost Explorer

    async def list_compute_resources(self) -> list[CloudResource]:
        """Discover running EC2 instances as CloudResource records.

        Uses the describe_instances paginator: a bare describe_instances()
        call returns at most one page, which silently truncated the
        inventory on large accounts in the original implementation.
        """
        resources = []
        paginator = self.ec2.get_paginator("describe_instances")
        for page in paginator.paginate():
            for reservation in page["Reservations"]:
                for inst in reservation["Instances"]:
                    # Only running instances are relevant for waste analysis.
                    if inst["State"]["Name"] != "running":
                        continue
                    tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
                    resources.append(CloudResource(
                        provider="aws",
                        resource_type="compute",
                        resource_id=inst["InstanceId"],
                        name=tags.get("Name", inst["InstanceId"]),
                        # AZ "us-east-1a" -> region "us-east-1"
                        region=inst["Placement"]["AvailabilityZone"][:-1],
                        instance_type=inst["InstanceType"],
                        tags=tags,
                    ))
        return resources

    async def get_utilization(self, resource_id: str, days: int) -> dict:
        """Return CPU stats from hourly CloudWatch averages over *days*.

        Returns {"avg_cpu", "max_cpu", "data_points"}; both CPU values are
        0 when CloudWatch has no datapoints for the window.
        """
        from datetime import datetime, timedelta, timezone
        # timezone-aware now(); datetime.utcnow() is naive and deprecated.
        end = datetime.now(timezone.utc)
        start = end - timedelta(days=days)
        response = self.cloudwatch.get_metric_statistics(
            Namespace="AWS/EC2",
            MetricName="CPUUtilization",
            Dimensions=[{"Name": "InstanceId", "Value": resource_id}],
            StartTime=start, EndTime=end,
            Period=3600, Statistics=["Average"],
        )
        points = [p["Average"] for p in response["Datapoints"]]
        return {
            "avg_cpu": sum(points) / len(points) if points else 0,
            "max_cpu": max(points) if points else 0,
            "data_points": len(points),
        }
Waste Detection Engine
The agent identifies five categories of waste: idle resources, oversized instances, unattached volumes, old snapshots, and unused load balancers.
from enum import Enum
class WasteCategory(Enum):
    """The five waste classes the detection engine reports."""

    IDLE = "idle"  # running but effectively unused compute
    OVERSIZED = "oversized"  # sustained low utilization; candidate for downsize
    UNATTACHED = "unattached"  # e.g. volumes not attached to any instance
    OLD_SNAPSHOT = "old_snapshot"
    UNUSED_LB = "unused_load_balancer"
@dataclass
class WasteFinding:
    """One actionable waste observation tied to a single resource."""

    resource: CloudResource
    category: WasteCategory
    monthly_waste: float  # estimated recoverable monthly spend
    confidence: float  # 0.0-1.0
    recommendation: str  # human-readable suggested action
    risk_level: str  # "safe", "review", "caution"
class WasteDetector:
    """Flags idle and oversized compute resources from utilization metrics.

    *idle_cpu_threshold* is the average-CPU floor below which a resource is
    considered idle; *oversize_cpu_max* is the average-CPU ceiling under
    which a downsize is suggested.
    """

    def __init__(self, idle_cpu_threshold: float = 5.0, oversize_cpu_max: float = 20.0):
        self.idle_cpu_threshold = idle_cpu_threshold
        self.oversize_cpu_max = oversize_cpu_max

    async def detect_idle_resources(
        self, resources: list[CloudResource], provider: CloudProvider
    ) -> list[WasteFinding]:
        """Return IDLE findings for compute resources whose 14-day average
        CPU falls below the idle threshold."""
        findings = []
        for resource in resources:
            if resource.resource_type != "compute":
                continue
            util = await provider.get_utilization(resource.resource_id, days=14)
            # Zero datapoints means no evidence either way (monitoring gap,
            # brand-new instance). The original read "no data" as 0% CPU and
            # would recommend termination at 0.95 confidence — skip instead.
            if util["data_points"] == 0:
                continue
            if util["avg_cpu"] < self.idle_cpu_threshold:
                findings.append(WasteFinding(
                    resource=resource,
                    category=WasteCategory.IDLE,
                    # Assume ~90% of the cost is recoverable by terminating.
                    monthly_waste=resource.monthly_cost * 0.9,
                    confidence=0.95 if util["max_cpu"] < 10 else 0.7,
                    recommendation=(
                        f"Instance {resource.name} averages {util['avg_cpu']:.1f}% CPU "
                        f"over 14 days. Consider terminating or downgrading."
                    ),
                    risk_level="safe" if util["max_cpu"] < 10 else "review",
                ))
        return findings

    async def detect_oversized_resources(
        self, resources: list[CloudResource], provider: CloudProvider
    ) -> list[WasteFinding]:
        """Return OVERSIZED findings where a 30-day window shows enough
        headroom for the next instance size down."""
        findings = []
        for resource in resources:
            if resource.resource_type != "compute":
                continue
            util = await provider.get_utilization(resource.resource_id, days=30)
            # Same no-data guard as detect_idle_resources: zero datapoints
            # would otherwise read as 0% CPU and trigger a downsize.
            if util["data_points"] == 0:
                continue
            if util["avg_cpu"] < self.oversize_cpu_max and util["max_cpu"] < 50:
                smaller = self._suggest_smaller_instance(resource.instance_type)
                if smaller:
                    # Estimate 40% of current cost recoverable by downsizing.
                    savings = resource.monthly_cost * 0.4
                    findings.append(WasteFinding(
                        resource=resource,
                        category=WasteCategory.OVERSIZED,
                        monthly_waste=savings,
                        confidence=0.8,
                        recommendation=(
                            f"Downsize {resource.instance_type} to {smaller}. "
                            f"Avg CPU: {util['avg_cpu']:.1f}%, Max: {util['max_cpu']:.1f}%."
                        ),
                        risk_level="review",
                    ))
        return findings

    def _suggest_smaller_instance(self, current: str) -> Optional[str]:
        """Map an instance type to the next size down; None when the type
        is not in the (deliberately conservative) downsize table."""
        downsize_map = {
            "m5.xlarge": "m5.large",
            "m5.2xlarge": "m5.xlarge",
            "m5.4xlarge": "m5.2xlarge",
            "c5.xlarge": "c5.large",
            "r5.xlarge": "r5.large",
        }
        return downsize_map.get(current)
Reserved Instance Recommendation Engine
The agent analyzes usage patterns to recommend reserved instance purchases.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
async def recommend_reserved_instances(
    resources: list[CloudResource],
    provider: CloudProvider,
) -> list[dict]:
    """Recommend 1-year reserved-instance purchases.

    Groups resources by instance type, keeps those with sustained usage
    (>10% average CPU over 60 days), and projects savings at the typical
    ~40% RI discount. Returns recommendation dicts sorted by annual
    savings, descending.
    """
    recommendations = []
    instance_type_groups: dict[str, list[CloudResource]] = {}
    for r in resources:
        if r.instance_type:
            instance_type_groups.setdefault(r.instance_type, []).append(r)
    for itype, group in instance_type_groups.items():
        if len(group) < 2:
            continue  # not enough to justify RI
        stable = []
        for r in group:
            util = await provider.get_utilization(r.resource_id, days=60)
            if util["avg_cpu"] > 10:
                stable.append(r)
        if len(stable) >= 2:
            # Sum the actual cost of the stable instances. The original
            # used stable_count * group[0].monthly_cost, which mispriced
            # groups whose members have different monthly costs.
            on_demand_monthly = sum(r.monthly_cost for r in stable)
            ri_monthly = on_demand_monthly * 0.6  # ~40% savings typical
            recommendations.append({
                "instance_type": itype,
                "count": len(stable),
                "term": "1-year",
                "on_demand_monthly": on_demand_monthly,
                "ri_monthly": ri_monthly,
                "monthly_savings": on_demand_monthly - ri_monthly,
                "annual_savings": (on_demand_monthly - ri_monthly) * 12,
            })
    return sorted(recommendations, key=lambda rec: rec["annual_savings"], reverse=True)
Generating the Cost Report
import openai
async def generate_cost_report(
    findings: list[WasteFinding],
    ri_recommendations: list[dict],
) -> str:
    """Render waste findings and RI recommendations into an executive
    summary via a single GPT-4o completion call; returns the report text."""
    total_waste = sum(f.monthly_waste for f in findings)
    total_ri_savings = sum(r["monthly_savings"] for r in ri_recommendations)
    # Build the prompt up front so the API call below stays readable.
    prompt = f"""Generate an executive cost optimization report.
Total monthly waste identified: ${total_waste:,.2f}
Total RI savings opportunity: ${total_ri_savings:,.2f}
Waste findings: {len(findings)}
Top waste categories: {[f.category.value for f in findings[:10]]}
Top RI recommendations: {ri_recommendations[:5]}
Format as a concise executive summary with: headline savings number,
top 5 quick wins, RI purchase plan, and risk assessment."""
    client = openai.AsyncOpenAI()
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
FAQ
How do I handle resources that have low average CPU but periodic spikes?
Look at the max CPU and p95 CPU over the analysis period, not just the average. A resource with 5% average but 90% spikes during business hours is not idle. The agent should classify these as "bursty" and recommend burstable instance types (like AWS t3 instances) rather than termination.
Should the agent automatically terminate or resize resources?
Never for production resources. The agent should generate recommendations with confidence scores and estimated savings. Provide a one-click approval workflow where a human reviews and approves each change. For development and staging environments, you can enable auto-remediation with a 24-hour grace period and Slack notification.
How do I track savings over time to prove ROI?
Tag each optimization action with a unique ID. Track the resource cost before and after the change. The agent maintains a savings ledger that compares actual monthly spend against the projected spend if no changes were made. Report the cumulative savings monthly to demonstrate ROI.
#CloudCostOptimization #AWS #GCP #Azure #FinOps #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.