Building a Cost Optimization Agent for Cloud Infrastructure: AWS, GCP, and Azure
Build an AI agent that analyzes cloud resource usage across AWS, GCP, and Azure, identifies waste, recommends rightsizing, and suggests reserved instance purchases for maximum savings.
The Cloud Cost Problem
Most organizations overspend on cloud infrastructure by 30-40%. The waste comes from oversized instances, unused resources, missing reserved instance coverage, and zombie infrastructure that nobody remembers deploying. An AI cost optimization agent continuously scans your cloud accounts, identifies waste, and generates actionable recommendations with projected savings.
Multi-Cloud Resource Discovery
The agent needs a unified view of resources across all cloud providers. A provider abstraction layer handles the differences.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class CloudResource:
    """Normalized view of a billable resource from any cloud provider.

    Utilization fields (avg_cpu_percent, avg_memory_percent) and
    monthly_cost default to 0.0 and are filled in by the provider layer.
    """

    provider: str  # "aws", "gcp", "azure"
    resource_type: str  # "compute", "database", "storage", "network"
    resource_id: str
    name: str
    region: str
    instance_type: Optional[str] = None
    monthly_cost: float = 0.0
    avg_cpu_percent: float = 0.0
    avg_memory_percent: float = 0.0
    last_accessed: Optional[str] = None  # timestamp string; None when unknown
    # default_factory gives each instance its own dict — the idiomatic fix
    # for the mutable-default pitfall the original worked around with
    # `tags: dict = None` plus a __post_init__ conversion.
    tags: dict = field(default_factory=dict)

    def __post_init__(self):
        # Preserve the original contract: an explicit tags=None still
        # normalizes to an empty dict.
        if self.tags is None:
            self.tags = {}
class CloudProvider(ABC):
    """Abstract interface that gives the agent a unified view of resources
    across cloud vendors; concrete providers (e.g. AWSProvider) implement
    discovery and metrics retrieval for one vendor."""

    @abstractmethod
    async def list_compute_resources(self) -> list[CloudResource]:
        """Return compute resources visible to this account as
        CloudResource records."""
        pass

    @abstractmethod
    async def get_utilization(self, resource_id: str, days: int) -> dict:
        """Return utilization stats for one resource over the trailing
        *days* window (AWS implementation returns keys: "avg_cpu",
        "max_cpu", "data_points")."""
        pass
class AWSProvider(CloudProvider):
    """CloudProvider implementation backed by boto3 (EC2, CloudWatch,
    Cost Explorer). Note: boto3 calls are blocking; these async methods do
    not overlap I/O — consider a thread executor if that matters."""

    def __init__(self):
        # Imported lazily so the module can be loaded without boto3 present.
        import boto3
        self.ec2 = boto3.client("ec2")
        self.cloudwatch = boto3.client("cloudwatch")
        self.ce = boto3.client("ce")  # Cost Explorer

    async def list_compute_resources(self) -> list[CloudResource]:
        """Discover running EC2 instances as CloudResource records.

        Uses the describe_instances paginator: a bare describe_instances()
        call returns at most one page, which silently truncated the
        inventory on large accounts in the original implementation.
        """
        resources = []
        paginator = self.ec2.get_paginator("describe_instances")
        for page in paginator.paginate():
            for reservation in page["Reservations"]:
                for inst in reservation["Instances"]:
                    # Only running instances are relevant for waste analysis.
                    if inst["State"]["Name"] != "running":
                        continue
                    tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
                    resources.append(CloudResource(
                        provider="aws",
                        resource_type="compute",
                        resource_id=inst["InstanceId"],
                        name=tags.get("Name", inst["InstanceId"]),
                        # AZ "us-east-1a" -> region "us-east-1"
                        region=inst["Placement"]["AvailabilityZone"][:-1],
                        instance_type=inst["InstanceType"],
                        tags=tags,
                    ))
        return resources

    async def get_utilization(self, resource_id: str, days: int) -> dict:
        """Return CPU stats from hourly CloudWatch averages over *days*.

        Returns {"avg_cpu", "max_cpu", "data_points"}; both CPU values are
        0 when CloudWatch has no datapoints for the window.
        """
        from datetime import datetime, timedelta, timezone
        # timezone-aware now(); datetime.utcnow() is naive and deprecated.
        end = datetime.now(timezone.utc)
        start = end - timedelta(days=days)
        response = self.cloudwatch.get_metric_statistics(
            Namespace="AWS/EC2",
            MetricName="CPUUtilization",
            Dimensions=[{"Name": "InstanceId", "Value": resource_id}],
            StartTime=start, EndTime=end,
            Period=3600, Statistics=["Average"],
        )
        points = [p["Average"] for p in response["Datapoints"]]
        return {
            "avg_cpu": sum(points) / len(points) if points else 0,
            "max_cpu": max(points) if points else 0,
            "data_points": len(points),
        }
Waste Detection Engine
The agent identifies five categories of waste: idle resources, oversized instances, unattached volumes, old snapshots, and unused load balancers.
from enum import Enum
class WasteCategory(Enum):
    """The five waste classes the detection engine reports."""

    IDLE = "idle"  # running but effectively unused compute
    OVERSIZED = "oversized"  # sustained low utilization; candidate for downsize
    UNATTACHED = "unattached"  # e.g. volumes not attached to any instance
    OLD_SNAPSHOT = "old_snapshot"
    UNUSED_LB = "unused_load_balancer"
@dataclass
class WasteFinding:
    """One actionable waste observation tied to a single resource."""

    resource: CloudResource
    category: WasteCategory
    monthly_waste: float  # estimated recoverable monthly spend
    confidence: float  # 0.0-1.0
    recommendation: str  # human-readable suggested action
    risk_level: str  # "safe", "review", "caution"
class WasteDetector:
    """Flags idle and oversized compute resources from utilization metrics.

    *idle_cpu_threshold* is the average-CPU floor below which a resource is
    considered idle; *oversize_cpu_max* is the average-CPU ceiling under
    which a downsize is suggested.
    """

    def __init__(self, idle_cpu_threshold: float = 5.0, oversize_cpu_max: float = 20.0):
        self.idle_cpu_threshold = idle_cpu_threshold
        self.oversize_cpu_max = oversize_cpu_max

    async def detect_idle_resources(
        self, resources: list[CloudResource], provider: CloudProvider
    ) -> list[WasteFinding]:
        """Return IDLE findings for compute resources whose 14-day average
        CPU falls below the idle threshold."""
        findings = []
        for resource in resources:
            if resource.resource_type != "compute":
                continue
            util = await provider.get_utilization(resource.resource_id, days=14)
            # Zero datapoints means no evidence either way (monitoring gap,
            # brand-new instance). The original read "no data" as 0% CPU and
            # would recommend termination at 0.95 confidence — skip instead.
            if util["data_points"] == 0:
                continue
            if util["avg_cpu"] < self.idle_cpu_threshold:
                findings.append(WasteFinding(
                    resource=resource,
                    category=WasteCategory.IDLE,
                    # Assume ~90% of the cost is recoverable by terminating.
                    monthly_waste=resource.monthly_cost * 0.9,
                    confidence=0.95 if util["max_cpu"] < 10 else 0.7,
                    recommendation=(
                        f"Instance {resource.name} averages {util['avg_cpu']:.1f}% CPU "
                        f"over 14 days. Consider terminating or downgrading."
                    ),
                    risk_level="safe" if util["max_cpu"] < 10 else "review",
                ))
        return findings

    async def detect_oversized_resources(
        self, resources: list[CloudResource], provider: CloudProvider
    ) -> list[WasteFinding]:
        """Return OVERSIZED findings where a 30-day window shows enough
        headroom for the next instance size down."""
        findings = []
        for resource in resources:
            if resource.resource_type != "compute":
                continue
            util = await provider.get_utilization(resource.resource_id, days=30)
            # Same no-data guard as detect_idle_resources: zero datapoints
            # would otherwise read as 0% CPU and trigger a downsize.
            if util["data_points"] == 0:
                continue
            if util["avg_cpu"] < self.oversize_cpu_max and util["max_cpu"] < 50:
                smaller = self._suggest_smaller_instance(resource.instance_type)
                if smaller:
                    # Estimate 40% of current cost recoverable by downsizing.
                    savings = resource.monthly_cost * 0.4
                    findings.append(WasteFinding(
                        resource=resource,
                        category=WasteCategory.OVERSIZED,
                        monthly_waste=savings,
                        confidence=0.8,
                        recommendation=(
                            f"Downsize {resource.instance_type} to {smaller}. "
                            f"Avg CPU: {util['avg_cpu']:.1f}%, Max: {util['max_cpu']:.1f}%."
                        ),
                        risk_level="review",
                    ))
        return findings

    def _suggest_smaller_instance(self, current: str) -> Optional[str]:
        """Map an instance type to the next size down; None when the type
        is not in the (deliberately conservative) downsize table."""
        downsize_map = {
            "m5.xlarge": "m5.large",
            "m5.2xlarge": "m5.xlarge",
            "m5.4xlarge": "m5.2xlarge",
            "c5.xlarge": "c5.large",
            "r5.xlarge": "r5.large",
        }
        return downsize_map.get(current)
Reserved Instance Recommendation Engine
The agent analyzes usage patterns to recommend reserved instance purchases.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
async def recommend_reserved_instances(
    resources: list[CloudResource],
    provider: CloudProvider,
) -> list[dict]:
    """Recommend 1-year reserved-instance purchases.

    Groups resources by instance type, keeps those with sustained usage
    (>10% average CPU over 60 days), and projects savings at the typical
    ~40% RI discount. Returns recommendation dicts sorted by annual
    savings, descending.
    """
    recommendations = []
    instance_type_groups: dict[str, list[CloudResource]] = {}
    for r in resources:
        if r.instance_type:
            instance_type_groups.setdefault(r.instance_type, []).append(r)
    for itype, group in instance_type_groups.items():
        if len(group) < 2:
            continue  # not enough to justify RI
        stable = []
        for r in group:
            util = await provider.get_utilization(r.resource_id, days=60)
            if util["avg_cpu"] > 10:
                stable.append(r)
        if len(stable) >= 2:
            # Sum the actual cost of the stable instances. The original
            # used stable_count * group[0].monthly_cost, which mispriced
            # groups whose members have different monthly costs.
            on_demand_monthly = sum(r.monthly_cost for r in stable)
            ri_monthly = on_demand_monthly * 0.6  # ~40% savings typical
            recommendations.append({
                "instance_type": itype,
                "count": len(stable),
                "term": "1-year",
                "on_demand_monthly": on_demand_monthly,
                "ri_monthly": ri_monthly,
                "monthly_savings": on_demand_monthly - ri_monthly,
                "annual_savings": (on_demand_monthly - ri_monthly) * 12,
            })
    return sorted(recommendations, key=lambda rec: rec["annual_savings"], reverse=True)
Generating the Cost Report
import openai
async def generate_cost_report(
    findings: list[WasteFinding],
    ri_recommendations: list[dict],
) -> str:
    """Render waste findings and RI recommendations into an executive
    summary via a single GPT-4o completion call; returns the report text."""
    total_waste = sum(f.monthly_waste for f in findings)
    total_ri_savings = sum(r["monthly_savings"] for r in ri_recommendations)
    # Build the prompt up front so the API call below stays readable.
    prompt = f"""Generate an executive cost optimization report.
Total monthly waste identified: ${total_waste:,.2f}
Total RI savings opportunity: ${total_ri_savings:,.2f}
Waste findings: {len(findings)}
Top waste categories: {[f.category.value for f in findings[:10]]}
Top RI recommendations: {ri_recommendations[:5]}
Format as a concise executive summary with: headline savings number,
top 5 quick wins, RI purchase plan, and risk assessment."""
    client = openai.AsyncOpenAI()
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
FAQ
How do I handle resources that have low average CPU but periodic spikes?
Look at the max CPU and p95 CPU over the analysis period, not just the average. A resource with 5% average but 90% spikes during business hours is not idle. The agent should classify these as "bursty" and recommend burstable instance types (like AWS t3 instances) rather than termination.
Should the agent automatically terminate or resize resources?
Never for production resources. The agent should generate recommendations with confidence scores and estimated savings. Provide a one-click approval workflow where a human reviews and approves each change. For development and staging environments, you can enable auto-remediation with a 24-hour grace period and Slack notification.
How do I track savings over time to prove ROI?
Tag each optimization action with a unique ID. Track the resource cost before and after the change. The agent maintains a savings ledger that compares actual monthly spend against the projected spend if no changes were made. Report the cumulative savings monthly to demonstrate ROI.
#CloudCostOptimization #AWS #GCP #Azure #FinOps #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.