Token-based context management removes messages when the total token count exceeds a predefined limit. Unlike sliding windows that work with message count, this approach considers the actual length of messages.
Message: "Hello, how are you today?"
Tokens: ["Hello", ",", " how", " are", " you", " today", "?"]
Count: 7 tokens
Longer Message: "I'm working on implementing a context management system"
Tokens: ["I", "'m", " working", " on", " implementing", " a", " context",
" management", " system"]
Count: 9 tokens
class TokenBasedManager:
    """Keep a conversation under a token budget.

    Messages are appended and the oldest non-system messages are evicted
    until the total token count fits within ``max_tokens``.
    """

    def __init__(self, max_tokens=4000):
        self.max_tokens = max_tokens      # token budget for the whole conversation
        self.messages = []                # [{"role": ..., "content": ...}, ...]
        self.tokenizer = get_tokenizer()  # GPT tokenizer, etc.

    def add_message(self, role, content):
        """Append a message, then trim if over the token budget."""
        message = {"role": role, "content": content}
        self.messages.append(message)
        self._trim_context()

    def _trim_context(self):
        """Evict oldest non-system messages until within the token budget.

        System messages are never removed; if only system messages remain
        we stop trimming (even while over budget) instead of looping forever.
        """
        total_tokens = self._count_tokens(self.messages)
        while total_tokens > self.max_tokens and len(self.messages) > 1:
            # Remove the oldest non-system message
            for i, msg in enumerate(self.messages):
                if msg['role'] != 'system':  # Keep system messages
                    removed = self.messages.pop(i)
                    total_tokens -= self._count_tokens([removed])
                    break
            else:
                # Only system messages remain: nothing is removable and
                # total_tokens cannot decrease, so bail out. (Without this
                # guard the while loop never terminates.)
                break

    def _count_tokens(self, messages):
        # Count each message independently and sum. Joining the contents
        # with spaces (as before) adds separator tokens, which made the
        # per-message subtraction in _trim_context inconsistent with the
        # total it was subtracted from.
        return sum(len(self.tokenizer.encode(msg['content'])) for msg in messages)
Initial State (Max: 10 tokens):
┌─────────────────────────────────────┐
│ [User: "Hi"] (2 tokens) │
│ [Bot: "Hello!"] (3 tokens) │
│ [User: "How are you?"] (4 tokens) │
│ Total: 9/10 tokens ✓ │
└─────────────────────────────────────┘
Add Long Message:
┌─────────────────────────────────────┐
│ [User: "Hi"] (2 tokens) │
│ [Bot: "Hello!"] (3 tokens) │
│ [User: "How are you?"] (4 tokens) │
│ [Bot: "I'm implementing context"] (8)│
│ Total: 17/10 tokens ✗ │
└─────────────────────────────────────┘
After Trimming:
┌─────────────────────────────────────┐
│ [User: "How are you?"] (4 tokens) │
│ [Bot: "I'm implementing context"] (8)│
│ Total: 12/10 tokens ✗ │
└─────────────────────────────────────┘
Final State:
┌─────────────────────────────────────┐
│ [Bot: "I'm implementing context"] (8)│
│ Total: 8/10 tokens ✓ │
└─────────────────────────────────────┘
Conversation:
User: "Hi" (2 tokens)
Assistant: "Hello!" (3 tokens)
User: "Can you help me debug this complex React application that uses hooks for state management and context API for global state?" (25 tokens)
When trimming occurs:
- The oldest messages are removed first, regardless of importance
- A single long new message can push many earlier messages out of the window
- Important information in the dropped messages is lost
Original Exchange:
User: "I have three issues with my code:
1. Memory leak in component A
2. Performance problem with function B
3. UI bug in component C"
Assistant: "Let me help with all three issues..."
After token-based trimming:
User: "UI bug in component C" ← Only this part remains
Assistant: "What UI bug?" ← Lost context of other issues
# Problem: System prompts can be accidentally removed
def bad_trim_logic(messages, max_tokens):
    """Anti-pattern: trims blindly from the front of the list, so the
    system prompt (conventionally messages[0]) is evicted like any other
    message. Shown here as the bug to avoid."""
    # Keep dropping the oldest message until the budget is met.
    while count_tokens(messages) > max_tokens:
        messages.pop(0)  # removes the system prompt too!
    return messages
# Better approach:
def good_trim_logic(messages, max_tokens):
    """Trim oldest non-system messages until the conversation fits.

    System messages are partitioned out first so they can never be
    dropped. If the system messages alone exceed the budget, we give up
    and return them over-budget rather than remove them.
    """
    system_msgs, other_msgs = [], []
    for m in messages:
        (system_msgs if m['role'] == 'system' else other_msgs).append(m)
    # Drop the oldest trimmable message while over budget.
    while count_tokens(system_msgs + other_msgs) > max_tokens and other_msgs:
        other_msgs.pop(0)
    return system_msgs + other_msgs
Scenario: Many short messages vs few long messages
Many Short Messages:
[Hi] [Hello] [How] [are] [you?] [Great!] [Thanks] [Bye]
Total: 8 messages, ~12 tokens
Few Long Messages:
["I'm working on a complex distributed system..."] (15 tokens)
["The architecture involves microservices..."] (18 tokens)
Token-based dropping treats both the same way, but the context
value differs significantly.
# Each trim operation requires:
# 1. Tokenize all messages
# 2. Count tokens
# 3. Remove messages
# 4. Repeat if still over limit
def count_tokens(messages):
    """Total token count across all message contents.

    Re-tokenizes every message on every call, which is computationally
    expensive — caching per-message counts avoids this.
    """
    return sum(len(tokenizer.encode(m['content'])) for m in messages)
# Optimization: Cache token counts
class OptimizedTokenManager:
    """Token-budget manager that caches per-message token counts.

    Each message is tokenized exactly once when added; trimming works from
    the cached counts, so no re-tokenization is ever needed.
    """

    def __init__(self, max_tokens, tokenizer=None):
        self.max_tokens = max_tokens
        self.messages = []       # [(message, token_count), ...]
        self.total_tokens = 0    # running sum of the cached counts
        # Optional injected tokenizer; falls back to the module-level one.
        self._tokenizer = tokenizer

    def add_message(self, message):
        """Append a message, cache its token count, and trim if needed."""
        enc = (self._tokenizer if self._tokenizer is not None else tokenizer).encode
        token_count = len(enc(message['content']))
        self.messages.append((message, token_count))
        self.total_tokens += token_count
        self._trim()

    def _trim(self):
        """Evict oldest non-system messages until within the budget.

        Uses only the cached counts. Stops when nothing is removable
        (only system messages remain), even if still over budget.
        The original called this method but never defined it, so every
        add_message raised AttributeError.
        """
        while self.total_tokens > self.max_tokens:
            for i, (msg, count) in enumerate(self.messages):
                if msg['role'] != 'system':
                    del self.messages[i]
                    self.total_tokens -= count
                    break
            else:
                break  # only system messages left; nothing removable
def smart_token_allocation(messages, max_tokens):
    """Split the token budget across system, recent, and historical messages.

    Fixed reservations are carved out for the system prompt and the most
    recent turns; whatever remains is spent on important historical
    messages.
    """
    system_budget = 100   # reserve for system prompt
    recent_budget = 500   # reserve for recent messages
    historical_budget = max_tokens - system_budget - recent_budget

    system_msgs = [m for m in messages if m['role'] == 'system']
    recent_msgs = get_recent_messages(messages, recent_budget)
    historical_msgs = select_important_messages(messages, historical_budget)
    return system_msgs + recent_msgs + historical_msgs
def chunk_long_messages(messages, max_chunk_size=100):
    """Return the messages with any over-long message split into chunks.

    Messages at or under ``max_chunk_size`` tokens pass through unchanged.
    """
    result = []
    for message in messages:
        if count_tokens([message]) > max_chunk_size:
            # Too long for one chunk: split and splice the pieces in.
            result.extend(split_into_chunks(message, max_chunk_size))
        else:
            result.append(message)
    return result
class TokenAwareSlidingWindow:
    """Sliding window bounded by token count instead of message count.

    Oldest non-system messages are evicted until the window fits within
    ``max_tokens``.
    """

    def __init__(self, max_tokens, tokenizer=None):
        self.max_tokens = max_tokens
        self.messages = []
        # Optional injected tokenizer; falls back to the module-level one.
        self._tokenizer = tokenizer

    def add_message(self, message):
        """Append a message, then evict oldest non-system messages to fit."""
        self.messages.append(message)
        # Remove messages until we fit in the token limit.
        while self._total_tokens() > self.max_tokens:
            # Remove oldest non-system message.
            for i, msg in enumerate(self.messages):
                if msg['role'] != 'system':
                    self.messages.pop(i)
                    break
            else:
                # Only system messages remain: nothing removable, so stop.
                # (Without this the while loop never terminates.)
                break

    def _total_tokens(self):
        """Token count across all messages (re-tokenizes on every call).

        The original called this method but never defined it.
        """
        enc = (self._tokenizer if self._tokenizer is not None else tokenizer).encode
        return sum(len(enc(m['content'])) for m in self.messages)
When you need precise control over costs:
# Estimate API cost based on tokens
def estimate_cost(tokens, model="gpt-4"):
    """Estimated API cost in dollars for ``tokens`` tokens of ``model``.

    Raises KeyError for a model not in the pricing table.
    """
    per_token = {"gpt-4": 0.03/1000, "gpt-3.5": 0.002/1000}[model]
    return tokens * per_token
Different models have different token limits:
When processing time is critical:
# Larger context = slower processing
processing_time = base_time + (tokens * time_per_token)
def trim_context(messages, max_tokens):
    """Trim oldest non-system messages until under the token budget.

    System messages are always preserved; if they alone exceed the budget
    there is nothing safe to drop, so the result may still be over-budget.
    """
    system_msgs = []
    other_msgs = []
    for msg in messages:
        if msg['role'] == 'system':
            system_msgs.append(msg)
        else:
            other_msgs.append(msg)
    # Only non-system messages may be evicted, oldest first.
    while count_tokens(system_msgs + other_msgs) > max_tokens and other_msgs:
        other_msgs.pop(0)
    return system_msgs + other_msgs
class ContextMonitor:
    """Records token-usage samples and warns when nearing the limit."""

    def __init__(self, max_tokens):
        self.max_tokens = max_tokens
        self.usage_history = []  # one entry per log_usage call

    def log_usage(self, current_tokens):
        """Record a usage sample; warn once usage passes 80% of the limit."""
        self.usage_history.append(current_tokens)
        threshold = self.max_tokens * 0.8
        if current_tokens > threshold:
            print("Warning: Approaching token limit!")
Token-based management works best when combined with:
Token-based context management provides precise control over context size but can be computationally expensive and may lose important information in long messages. It's most effective when: