API Reference¶
HeadroomClient¶
The main entry point for the Headroom SDK.
from headroom import HeadroomClient
from openai import OpenAI
client = HeadroomClient(
original_client=OpenAI(),
default_mode="optimize",
)
Constructor Parameters¶
| Parameter | Type | Default | Description |
|---|---|---|---|
| `original_client` | `OpenAI \| Anthropic` | Required | The underlying LLM client |
| `provider` | `Provider` | Auto-detected | Token counting provider |
| `default_mode` | `str` | `"audit"` | Default mode: `"audit"`, `"optimize"`, or `"off"` |
| `store_url` | `str` | `None` | Storage URL for metrics |
| `smart_crusher_config` | `SmartCrusherConfig` | Default | Compression settings |
| `cache_aligner_config` | `CacheAlignerConfig` | Default | Cache alignment settings |
| `rolling_window_config` | `RollingWindowConfig` | Default | Context window settings |
Methods¶
chat.completions.create(**kwargs)¶
Create a chat completion with optional optimization.
response = client.chat.completions.create(
model="gpt-4o",
messages=[...],
headroom_mode="optimize", # Override default mode
)
Additional Parameters:
| Parameter | Type | Description |
|---|---|---|
| `headroom_mode` | `str` | Override mode for this request |
| `headroom_query` | `str` | Query for relevance scoring |
chat.completions.simulate(**kwargs)¶
Preview optimization without making an API call.
plan = client.chat.completions.simulate(
model="gpt-4o",
messages=[...],
)
print(f"Tokens before: {plan.tokens_before}")
print(f"Tokens after: {plan.tokens_after}")
print(f"Savings: {plan.savings_percent:.1f}%")
Returns: SimulationResult
Configuration Classes¶
SmartCrusherConfig¶
from headroom import SmartCrusherConfig
config = SmartCrusherConfig(
min_tokens_to_crush=200,
max_items_after_crush=50,
keep_first=3,
keep_last=2,
relevance_threshold=0.3,
anomaly_std_threshold=2.0,
preserve_errors=True,
)
CacheAlignerConfig¶
from headroom import CacheAlignerConfig
config = CacheAlignerConfig(
extract_dates=True,
normalize_whitespace=True,
stable_prefix_min_tokens=100,
)
RollingWindowConfig¶
from headroom import RollingWindowConfig
config = RollingWindowConfig(
max_tokens=100000,
preserve_system=True,
preserve_recent_turns=5,
drop_oldest_first=True,
)
IntelligentContextConfig¶
from headroom.config import IntelligentContextConfig, ScoringWeights
weights = ScoringWeights(
recency=0.20,
semantic_similarity=0.20,
toin_importance=0.25,
error_indicator=0.15,
forward_reference=0.15,
token_density=0.05,
)
config = IntelligentContextConfig(
enabled=True,
keep_system=True,
keep_last_turns=2,
output_buffer_tokens=4000,
use_importance_scoring=True,
scoring_weights=weights,
toin_integration=True,
recency_decay_rate=0.1,
compress_threshold=0.1,
)
ScoringWeights¶
from headroom.config import ScoringWeights
weights = ScoringWeights(
recency=0.20, # Exponential decay from end
semantic_similarity=0.20, # Embedding similarity to recent context
toin_importance=0.25, # TOIN retrieval_rate
error_indicator=0.15, # TOIN field_semantics error detection
forward_reference=0.15, # Messages referenced by later messages
token_density=0.05, # Unique/total token ratio
)
# Weights are auto-normalized to sum to 1.0
normalized = weights.normalized()
RelevanceScorerConfig¶
from headroom import RelevanceScorerConfig
config = RelevanceScorerConfig(
scorer_type="bm25", # "bm25", "embedding", or "hybrid"
embedding_model=None, # Model name for embedding scorer
hybrid_alpha=0.5, # Weight for hybrid scoring
)
Data Models¶
SimulationResult¶
Returned by simulate().
@dataclass
class SimulationResult:
    tokens_before: int
    tokens_after: int
    tokens_saved: int
    savings_percent: float
    transforms_applied: list[str]
    waste_signals: WasteSignals
RequestMetrics¶
Metrics for a single request.
@dataclass
class RequestMetrics:
    request_id: str
    timestamp: datetime
    model: str
    tokens_input_before: int
    tokens_input_after: int
    tokens_output: int
    cost_before: float
    cost_after: float
    transforms_applied: list[str]
WasteSignals¶
Detected waste in the request.
@dataclass
class WasteSignals:
    json_bloat_tokens: int
    html_noise_tokens: int
    whitespace_tokens: int
    dynamic_date_tokens: int
    repetition_tokens: int
Providers¶
OpenAIProvider¶
from headroom import OpenAIProvider
provider = OpenAIProvider()
# Get token counter
counter = provider.get_token_counter("gpt-4o")
tokens = counter.count_text("Hello, world!")
# Get context limit
limit = provider.get_context_limit("gpt-4o") # 128000
# Estimate cost
cost = provider.estimate_cost(
input_tokens=1000,
output_tokens=500,
model="gpt-4o",
)
AnthropicProvider¶
from headroom import AnthropicProvider
from anthropic import Anthropic
provider = AnthropicProvider(client=Anthropic())
counter = provider.get_token_counter("claude-3-5-sonnet-latest")
tokens = counter.count_messages(messages) # Accurate count via API
Relevance Scoring¶
BM25Scorer¶
Fast keyword-based scoring (zero dependencies).
from headroom import BM25Scorer
scorer = BM25Scorer()
scores = scorer.score_items(
items=["item 1", "item 2", ...],
query="search query",
)
EmbeddingScorer¶
Semantic similarity scoring (requires sentence-transformers).
from headroom import EmbeddingScorer, embedding_available
if embedding_available():
    scorer = EmbeddingScorer(model="all-MiniLM-L6-v2")
    scores = scorer.score_items(items, query)
HybridScorer¶
Combines BM25 and embeddings.
from headroom import HybridScorer
scorer = HybridScorer(alpha=0.5) # 50% BM25, 50% embedding
scores = scorer.score_items(items, query)
create_scorer()¶
Factory function to create scorers.
from headroom import create_scorer
# Auto-select best available scorer
scorer = create_scorer()
# Explicitly choose type
scorer = create_scorer(scorer_type="hybrid", alpha=0.7)
Transforms (Direct Use)¶
SmartCrusher¶
from headroom import SmartCrusher
crusher = SmartCrusher()
result = crusher.crush(
data={"results": [...]},
query="user query",
)
CacheAligner¶
RollingWindow¶
from headroom import RollingWindow
window = RollingWindow(config)
result = window.apply(messages, max_tokens=100000)
IntelligentContextManager¶
from headroom.transforms import IntelligentContextManager
from headroom.config import IntelligentContextConfig
from headroom.telemetry import get_toin
# With TOIN integration for learned patterns
toin = get_toin()
config = IntelligentContextConfig(
keep_system=True,
keep_last_turns=2,
use_importance_scoring=True,
)
manager = IntelligentContextManager(config, toin=toin)
result = manager.apply(messages, tokenizer, model_limit=128000)
# Access scoring details
print(result.transforms_applied) # ["intelligent_cap:3"]
print(result.tokens_before, result.tokens_after)
MessageScorer¶
from headroom.transforms import MessageScorer, MessageScore
from headroom.config import ScoringWeights
scorer = MessageScorer(
weights=ScoringWeights(),
toin=None, # Optional TOIN for learned patterns
embedding_provider=None, # Optional for semantic similarity
recency_decay_rate=0.1,
)
# Score messages
scores: list[MessageScore] = scorer.score_messages(
messages=messages,
protected_indices={0}, # System message
tool_unit_indices={2, 3}, # Tool call + response
)
for score in scores:
    print(f"Message {score.message_index}: {score.total_score:.2f}")
    print(f"  Recency: {score.recency_score:.2f}")
    print(f"  TOIN: {score.toin_score:.2f}")
    print(f"  Protected: {score.is_protected}")
TransformPipeline¶
from headroom import CacheAligner, RollingWindow, SmartCrusher, TransformPipeline
pipeline = TransformPipeline([
SmartCrusher(),
CacheAligner(),
RollingWindow(),
])
result = pipeline.transform(messages)
Utilities¶
Tokenizer¶
from headroom import Tokenizer, count_tokens_text, count_tokens_messages
# Quick counting
tokens = count_tokens_text("Hello, world!", model="gpt-4o")
# With tokenizer instance
tokenizer = Tokenizer(model="gpt-4o")
tokens = tokenizer.count_text("Hello")
tokens = tokenizer.count_messages(messages)
generate_report()¶
Generate HTML/Markdown reports from stored metrics.