Problem: Your AI Agent Forgets Everything
You build a great AI agent. User comes back the next day — the agent has no idea who they are, what they asked, or what was decided. Every session starts from zero.
You'll learn:
- Why LLMs are stateless and what that means for agents
- Three practical memory patterns: buffer, summary, and vector search
- How to persist and retrieve memory across sessions using Python
Time: 25 min | Level: Intermediate
Why This Happens
LLMs don't store state. When a session ends, the context window is gone. Nothing carries over unless you explicitly save and reload it.
Most frameworks handle in-session memory fine — but cross-session memory is your problem to solve.
Common symptoms:
- Agent re-asks for user's name, preferences, or prior decisions
- No continuity in multi-day workflows
- Users frustrated by repeating themselves
Solution
There are three memory patterns. Use the one that matches your use case.
| Pattern | Best For | Storage |
|---|---|---|
| Buffer | Short sessions, exact recall | DB / file |
| Summary | Long sessions, big picture | DB + LLM |
| Vector search | Large history, semantic retrieval | Vector DB |
Step 1: Set Up Storage
You need somewhere to persist memory between sessions. SQLite is fine to start.
import json
import sqlite3
from datetime import datetime, timezone
def get_db(db_path: str = "agent_memory.db") -> sqlite3.Connection:
    """Open the SQLite memory store, creating the schema if needed.

    Args:
        db_path: Path of the database file. Defaults to the original
            hard-coded "agent_memory.db" so existing callers are unaffected;
            pass ":memory:" for an ephemeral store in tests.

    Returns:
        An open sqlite3.Connection with the `memories` table and its
        lookup index in place. Callers are responsible for closing it.
    """
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS memories (
            session_id TEXT,
            user_id TEXT,
            role TEXT,
            content TEXT,
            timestamp TEXT
        )
    """)
    # load_history filters by user_id and orders by timestamp; without this
    # index every read is a full table scan.
    conn.execute(
        "CREATE INDEX IF NOT EXISTS idx_memories_user_ts "
        "ON memories (user_id, timestamp)"
    )
    conn.commit()
    return conn
def save_message(user_id: str, session_id: str, role: str, content: str) -> None:
    """Persist one chat message to the SQLite memory store.

    Args:
        user_id: Stable identifier for the user (cross-session key).
        session_id: Identifier for the current session.
        role: Message author, e.g. "user" or "assistant".
        content: The message text.
    """
    db = get_db()
    try:
        db.execute(
            "INSERT INTO memories VALUES (?, ?, ?, ?, ?)",
            # datetime.utcnow() is deprecated (3.12+) and naive; use an
            # aware UTC timestamp. ISO strings still sort lexicographically.
            (session_id, user_id, role, content,
             datetime.now(timezone.utc).isoformat()),
        )
        db.commit()
    finally:
        # The original leaked a fresh connection on every call.
        db.close()
def load_history(user_id: str, limit: int = 20) -> list[dict]:
    """Load the user's most recent messages across all sessions.

    Args:
        user_id: Cross-session user key to filter on.
        limit: Maximum number of messages to return.

    Returns:
        Up to `limit` messages as {"role", "content"} dicts, oldest first
        (ready to pass as an Anthropic `messages` list).
    """
    db = get_db()
    try:
        rows = db.execute(
            "SELECT role, content FROM memories WHERE user_id = ? "
            "ORDER BY timestamp DESC LIMIT ?",
            (user_id, limit),
        ).fetchall()
    finally:
        # Close the per-call connection (the original leaked it).
        db.close()
    # The query returns newest-first; reverse so oldest messages come first.
    return [{"role": r[0], "content": r[1]} for r in reversed(rows)]
Expected: An agent_memory.db file is created on first run.
Step 2: Buffer Memory (Simple Recall)
Reload the last N messages and prepend them to every new request.
import anthropic
# Module-level client shared by all chat functions below; by default the SDK
# reads the API key from the ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic()
def chat_with_memory(user_id: str, session_id: str, user_input: str) -> str:
    """Answer `user_input` with buffer memory: recent history is reloaded
    from SQLite and replayed ahead of the new message.

    Args:
        user_id: Cross-session user key used to load and save history.
        session_id: Current session identifier (stored with each message).
        user_input: The live user message.

    Returns:
        The assistant's reply text. Both sides of the exchange are persisted.
    """
    # Load prior history for this user (oldest first).
    history = load_history(user_id, limit=20)
    # The Messages API requires the conversation to start with a "user"
    # turn. LIMIT can cut a user/assistant pair in half, leaving a leading
    # assistant message, so drop any non-user prefix.
    while history and history[0]["role"] != "user":
        history.pop(0)
    # Append the new user message.
    history.append({"role": "user", "content": user_input})
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system="You are a helpful assistant. Use the conversation history to maintain context.",
        messages=history,
    )
    reply = response.content[0].text
    # Persist both sides of the exchange so future sessions can reload it.
    save_message(user_id, session_id, "user", user_input)
    save_message(user_id, session_id, "assistant", reply)
    return reply
Why this works: The model sees prior exchanges just like a continuing conversation, even though they came from a different session.
If it fails:
- Context too long error: Reduce `limit`, or switch to summary memory below
- Wrong user's history loading: Double-check you're filtering by `user_id`, not just `session_id`
Step 3: Summary Memory (Long-Running Agents)
Buffer memory hits token limits fast. Summary memory compresses old context into a paragraph and prepends that instead.
def summarize_history(history: list[dict]) -> str:
    """Compress a message history into a short prose summary via the model.

    Args:
        history: Messages as {"role", "content"} dicts, oldest first.

    Returns:
        A 3-5 sentence summary, or "" for empty input so callers can skip
        injecting context entirely.
    """
    if not history:
        return ""
    transcript = "\n".join(
        "{}: {}".format(msg["role"].upper(), msg["content"]) for msg in history
    )
    prompt = (
        "Summarize this conversation history in 3-5 sentences. "
        "Focus on decisions made, user preferences, and open tasks.\n\n"
        + transcript
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=300,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
def chat_with_summary_memory(user_id: str, session_id: str, user_input: str) -> str:
    """Answer `user_input` with summary memory: prior history is compressed
    into a short summary and injected before the live message.

    Args:
        user_id: Cross-session user key used to load and save history.
        session_id: Current session identifier (stored with each message).
        user_input: The live user message.

    Returns:
        The assistant's reply text. Both sides of the exchange are persisted.
    """
    prior = load_history(user_id, limit=50)
    summary = summarize_history(prior)
    messages: list[dict] = []
    if summary:
        # Seed the conversation with the compressed context plus a short
        # acknowledgement turn so user/assistant roles alternate.
        messages.extend([
            {"role": "user", "content": f"[Context from prior sessions]: {summary}"},
            {"role": "assistant", "content": "Understood. I'll keep that context in mind."},
        ])
    messages.append({"role": "user", "content": user_input})
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system="You are a helpful assistant with memory of prior interactions.",
        messages=messages,
    )
    reply = response.content[0].text
    save_message(user_id, session_id, "user", user_input)
    save_message(user_id, session_id, "assistant", reply)
    return reply
Why this works: Summaries are 10-20x smaller than raw history, so you can cover weeks of conversation in a few hundred tokens.
Step 4: Vector Search Memory (Large or Multi-Topic History)
For agents with extensive history, use embeddings to retrieve only the relevant past context — not all of it.
pip install chromadb anthropic
import chromadb
from chromadb.utils import embedding_functions
# Local sentence-transformers embedding model (runs on-device, no API key);
# swap for an OpenAI/Cohere embedding function if you prefer hosted embeddings.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
# PersistentClient writes vectors to disk so memories survive process restarts.
chroma = chromadb.PersistentClient(path="./chroma_memory")
collection = chroma.get_or_create_collection("agent_memory", embedding_function=ef)
def store_memory(user_id: str, text: str, metadata: dict | None = None) -> None:
    """Embed and store one memory snippet for `user_id` in ChromaDB.

    Args:
        user_id: Owner of the memory; stored in metadata so queries can
            filter per user.
        text: The content to embed and store.
        metadata: Optional extra metadata merged in alongside user_id.
            (Was a mutable default `{}` — shared across all calls; fixed.)
    """
    extra = metadata if metadata is not None else {}
    # utcnow().timestamp() interprets a naive datetime as *local* time,
    # shifting the epoch value by the UTC offset; use an aware datetime.
    doc_id = f"{user_id}_{datetime.now(timezone.utc).timestamp()}"
    collection.add(
        documents=[text],
        metadatas=[{"user_id": user_id, **extra}],
        ids=[doc_id],
    )
def retrieve_relevant_memory(user_id: str, query: str, top_k: int = 5) -> list[str]:
    """Return up to `top_k` stored snippets most similar to `query`,
    restricted to this user's memories.

    Args:
        user_id: Filters results to memories stored for this user.
        query: Text whose embedding drives the similarity search.
        top_k: Maximum number of snippets to return.

    Returns:
        Matching document strings, best match first; [] when nothing matches.
    """
    hits = collection.query(
        query_texts=[query],
        n_results=top_k,
        where={"user_id": user_id},
    )
    docs = hits["documents"]
    # query() returns one result list per query text; we sent exactly one.
    return docs[0] if docs else []
def chat_with_vector_memory(user_id: str, session_id: str, user_input: str) -> str:
    """Answer `user_input` with vector-search memory: only past exchanges
    semantically similar to the current question are injected, via the
    system prompt rather than the message list.

    Args:
        user_id: Cross-session user key used to retrieve and store memories.
        session_id: Current session identifier (stored as memory metadata).
        user_input: The live user message; also the retrieval query.

    Returns:
        The assistant's reply text. The full exchange is stored for
        future retrieval.
    """
    # Fetch semantically relevant past exchanges for this user.
    matches = retrieve_relevant_memory(user_id, user_input)
    context = "\n".join(matches)
    base_prompt = "You are a helpful assistant."
    if context:
        system_prompt = base_prompt + f"\n\nRelevant past context:\n{context}"
    else:
        system_prompt = base_prompt
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=system_prompt,
        messages=[{"role": "user", "content": user_input}],
    )
    reply = response.content[0].text
    # Store the full exchange so future queries can surface it.
    store_memory(user_id, f"User: {user_input}\nAssistant: {reply}", {"session": session_id})
    return reply
Why this works: Instead of dumping all history into the prompt, you embed the user's current question and pull only the most semantically similar past exchanges. Fast and token-efficient.
If it fails:
- ChromaDB import error: Run `pip install chromadb --upgrade`
- Empty results on first run: Expected — the store is empty until prior exchanges have been saved
Verification
Run a quick two-session test:
# Session 1: tell the agent a fact, which gets persisted to agent_memory.db.
reply1 = chat_with_memory("user_42", "session_001", "My name is Dana and I prefer dark mode.")
print(reply1)
# Session 2 (new session id, same user id): the fact should be reloaded
# from storage rather than re-asked.
reply2 = chat_with_memory("user_42", "session_002", "What UI preference did I mention?")
print(reply2)
# Expected: the agent recalls the "dark mode" preference from session_001.
You should see: The agent correctly references Dana's dark mode preference in the second session without being told again.
What You Learned
- LLMs are stateless — persistence is always your responsibility
- Buffer memory is the simplest approach but hits token limits quickly
- Summary memory compresses history, trading exact recall for scale
- Vector search retrieves relevant context without loading everything — best for production agents with long-running users
Limitation: Summary memory loses specific details. If exact recall matters (e.g., a user's account number), use buffer or store structured data separately alongside your memory layer.
When NOT to use this: Single-turn tools or stateless APIs where continuity doesn't add value — the overhead isn't worth it.
Tested on Python 3.12, anthropic SDK 0.40+, ChromaDB 0.5+, Ubuntu 24 & macOS Sequoia