Learn how to implement intelligent caching strategies that can reduce your AI costs by 10-20% through response reuse and semantic similarity matching.
Smart caching is one of the most effective cost optimization techniques for AI applications. By intelligently storing and reusing responses, you can achieve 10-20% cost reductions while improving response times and reducing API rate limiting issues.
Enterprise Case Study: A SaaS company reduced their monthly OpenAI costs from $15,000 to $12,200 (18.6% savings) by implementing semantic caching for their customer support chatbot.
The simplest approach: cache responses for identical prompts. Best for FAQ systems and repeated queries.
import hashlib
import redis
import json
class ExactMatchCache:
    """Redis-backed cache keyed on the exact (model, prompt) pair.

    Best suited to FAQ-style workloads where identical prompts recur.
    Responses are stored as JSON strings and expire after ``ttl`` seconds.
    """

    def __init__(self, redis_client, ttl: int = 86400):
        """
        Args:
            redis_client: any object exposing redis-py's ``get``/``setex``.
            ttl: entry lifetime in seconds (default: 24 hours).
        """
        self.redis = redis_client
        self.ttl = ttl

    def get_cache_key(self, prompt: str, model: str) -> str:
        """Build a deterministic Redis key for this (model, prompt) pair."""
        # MD5 is acceptable here: the digest is a cache key, not a security token.
        content = f"{model}:{prompt}"
        return f"ai_cache:{hashlib.md5(content.encode()).hexdigest()}"

    def get(self, prompt: str, model: str):
        """Return the cached response dict, or None on a cache miss."""
        cached = self.redis.get(self.get_cache_key(prompt, model))
        return json.loads(cached) if cached else None

    def set(self, prompt: str, model: str, response: dict) -> None:
        """Store ``response`` under the pair's key with the configured TTL."""
        key = self.get_cache_key(prompt, model)
        self.redis.setex(key, self.ttl, json.dumps(response))
# Usage: consult the cache first; pay for an API call only on a miss.
cache = ExactMatchCache(redis.Redis())
if not (response := cache.get(prompt, "gpt-4")):
    # Cache miss — make one paid API call, then remember the answer.
    response = openai_client.chat.completions.create(...)
    cache.set(prompt, "gpt-4", response)
Expected Savings: 5-8% for applications with repeated queries
More sophisticated approach using embeddings to find similar prompts. Higher implementation complexity but better cache hit rates.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import openai
class SemanticCache:
    """In-memory cache that reuses responses for semantically similar prompts.

    Each stored prompt keeps its embedding; lookups return the response of
    the *most* similar stored prompt, provided its cosine similarity meets
    ``similarity_threshold``. Lookup cost is O(n) over stored prompts.
    """

    def __init__(self, similarity_threshold: float = 0.95):
        """
        Args:
            similarity_threshold: minimum cosine similarity (in [0, 1])
                required to treat a stored prompt as a cache hit.
        """
        self.cache = {}        # prompt -> response dict
        self.embeddings = {}   # prompt -> embedding vector
        self.threshold = similarity_threshold

    def get_embedding(self, text: str):
        """Fetch the embedding vector for ``text`` from the OpenAI API."""
        # NOTE(review): this is the legacy (pre-1.0) OpenAI SDK interface —
        # confirm the installed library version before deploying.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text
        )
        return response['data'][0]['embedding']

    @staticmethod
    def _cosine(a, b) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0

    def find_similar(self, prompt: str):
        """Return the cached response for the most similar stored prompt.

        Returns None when the cache is empty or no stored prompt reaches
        the similarity threshold. Costs one embedding API call per lookup.
        """
        if not self.embeddings:
            return None
        query_embedding = self.get_embedding(prompt)
        # Track the single best match rather than returning the first hit,
        # so the answer does not depend on insertion order.
        best_prompt, best_score = None, -1.0
        for cached_prompt, cached_embedding in self.embeddings.items():
            score = self._cosine(query_embedding, cached_embedding)
            if score > best_score:
                best_prompt, best_score = cached_prompt, score
        if best_score >= self.threshold:
            return self.cache[best_prompt]
        return None

    def store(self, prompt: str, response: dict) -> None:
        """Cache ``response`` for ``prompt``, embedding the prompt once."""
        self.embeddings[prompt] = self.get_embedding(prompt)
        self.cache[prompt] = response
# Usage with similarity matching
semantic_cache = SemanticCache(similarity_threshold=0.95)
similar_response = semantic_cache.find_similar(new_prompt)
if similar_response:
# Use cached response (costs $0)
return similar_response
else:
# Make new API call and cache
response = openai_client.chat.completions.create(...)
Expected Savings: 12-20% for applications with similar but not identical queries
Implement smart TTL policies based on content type and freshness requirements.
Get monitoring and optimization tools to track your caching implementation success and maximize cost savings.
Get Optimization Tools