Architecture · Staff
Staff Prep 12: Rate Limiting & Throttling — Token Bucket, Sliding Window & Redis
April 4, 2026 · 9 min read · PART 12 / 18
Back to Part 11: Pagination & Filtering. Rate limiting protects your API from abuse, accidental thundering herds, and runaway clients. But in-process counters break the moment you run more than one worker. Redis-backed distributed rate limiting — with atomic operations — is the only implementation that holds under production conditions.
Why in-process rate limiting fails
python
from collections import defaultdict
import time
# Broken: in-process counter
# Per-process state: lives in this worker's memory only, resets on restart,
# and is never shared with sibling workers.
request_counts = defaultdict(int)
window_start = defaultdict(float)


def check_rate_limit_broken(user_id: str, limit: int = 100) -> bool:
    """Fixed-window limiter kept in process memory (the counter-example).

    Each worker process owns its own dicts, so with N workers a user
    effectively gets N * `limit` requests per minute. Kept here to show
    why the Redis-backed variants below exist.
    """
    current = time.time()
    # A window older than 60s has expired; start a fresh one for this user.
    if current - window_start[user_id] > 60:
        request_counts[user_id] = 0
        window_start[user_id] = current
    request_counts[user_id] += 1
    return request_counts[user_id] <= limit
# With 4 Uvicorn workers:
# - Each worker has its own request_counts dict
# - Each worker allows 100 req/min independently
# - Effective limit = 400 req/min, not 100
# - No enforcement across workers
Algorithm 1: fixed window counter (Redis)
python
import redis.asyncio as redis
import time
# Single shared connection: every worker talks to the same counter store.
r = redis.Redis(host="localhost", port=6379, decode_responses=True)


async def check_fixed_window(user_id: str, limit: int = 100, window_seconds: int = 60) -> bool:
    """Fixed-window counter backed by Redis INCR.

    All workers increment one key per (user, window) pair, so the limit
    holds cluster-wide. Still subject to the burst-at-boundary problem
    described below.
    """
    ts = int(time.time())
    window = ts - (ts % window_seconds)  # floor to the window boundary
    key = f"ratelimit:{user_id}:{window}"
    hits = await r.incr(key)
    if hits == 1:
        # First increment created the key; attach a TTL so stale windows
        # are garbage-collected instead of accumulating forever.
        await r.expire(key, window_seconds * 2)
    return hits <= limit
# Problem: burst at window boundaries
# At 59s: 100 requests (uses full window)
# At 60s: window resets to 0
# At 61s: another 100 requests
# Effective burst: 200 requests in 2 seconds
Algorithm 2: sliding window log
python
async def check_sliding_window(user_id: str, limit: int = 100, window_seconds: int = 60) -> bool:
    """Sliding-window-log limiter: exact rolling window, O(hits) memory.

    Records one sorted-set entry per request (scored by arrival time) and
    counts the entries younger than `window_seconds`.

    Fix: the ZSET member must be unique per request. The naive member
    `str(now)` lets two requests arriving at the same float timestamp
    overwrite each other (ZADD upserts by member), undercounting
    concurrent traffic and under-enforcing the limit.
    """
    import uuid

    now = time.time()
    window_start = now - window_seconds
    key = f"ratelimit:sliding:{user_id}"
    # Unique member even when timestamps collide across workers.
    member = f"{now}:{uuid.uuid4().hex}"
    async with r.pipeline(transaction=True) as pipe:
        pipe.zremrangebyscore(key, 0, window_start)  # drop entries outside the window
        pipe.zadd(key, {member: now})                # record this request
        pipe.zcard(key)                              # hits currently in the window
        pipe.expire(key, window_seconds * 2)         # GC keys of idle users
        results = await pipe.execute()
    count = results[2]
    return count <= limit
# Accurate: counts requests in exact rolling 60-second window
# Expensive: stores one entry per request (memory scales with request rate)
# At 1000 req/min: 1000 entries per user in the sorted set
Algorithm 3: token bucket (recommended)
Token bucket allows controlled bursting while enforcing average rate. A bucket holds up to
capacity tokens. Tokens refill at rate per second. Each request
consumes one token. No token = rate limited.
python
import redis.asyncio as redis
import time
import math
r = redis.Redis(host="localhost", port=6379, decode_responses=True)
# Lua script for atomic token bucket (runs atomically in Redis)
# Lua source for an atomic token-bucket check. Redis executes Lua scripts
# atomically, so the read-refill-decide-write sequence on the bucket hash
# cannot interleave with another worker's check.
#   KEYS[1]          bucket hash key (fields: 'tokens', 'last_time')
#   ARGV[1..4]       capacity, refill rate (tokens/sec), current unix time,
#                    tokens requested for this call
#   returns          1 = allowed (tokens deducted), 0 = denied
# Missing fields default to a full bucket ('or capacity') seen just now
# ('or now'); the 3600s EXPIRE garbage-collects buckets of idle users.
TOKEN_BUCKET_SCRIPT = """
local key = KEYS[1]
local capacity = tonumber(ARGV[1])
local rate = tonumber(ARGV[2])
local now = tonumber(ARGV[3])
local requested = tonumber(ARGV[4])
local last_time = tonumber(redis.call('hget', key, 'last_time') or now)
local tokens = tonumber(redis.call('hget', key, 'tokens') or capacity)
-- Refill tokens based on elapsed time
local elapsed = now - last_time
local new_tokens = math.min(capacity, tokens + elapsed * rate)
if new_tokens >= requested then
redis.call('hset', key, 'tokens', new_tokens - requested)
redis.call('hset', key, 'last_time', now)
redis.call('expire', key, 3600)
return 1 -- allowed
else
redis.call('hset', key, 'tokens', new_tokens)
redis.call('hset', key, 'last_time', now)
redis.call('expire', key, 3600)
return 0 -- denied
end
"""
# Lazily-registered Script handle; populated on first use inside
# check_token_bucket so importing this module does not need a live
# Redis connection.
token_bucket_lua = None
async def check_token_bucket(
    user_id: str,
    capacity: int = 60,  # burst ceiling: up to 60 requests back-to-back
    rate: float = 1.0,   # refill: 1 token/sec -> 60/min sustained average
) -> bool:
    """Token-bucket limiter: allow if a token is available, else deny.

    The Lua script is registered with Redis on first call and the handle
    is cached module-wide; the script itself runs atomically server-side.
    """
    global token_bucket_lua
    if token_bucket_lua is None:
        token_bucket_lua = r.register_script(TOKEN_BUCKET_SCRIPT)
    bucket_key = f"ratelimit:tb:{user_id}"
    verdict = await token_bucket_lua(
        keys=[bucket_key],
        args=[capacity, rate, time.time(), 1],
    )
    return bool(verdict)
FastAPI middleware integration
python
from fastapi import Request, Response
from fastapi.middleware.base import BaseHTTPMiddleware
import time
class RateLimitMiddleware(BaseHTTPMiddleware):
    """Apply token-bucket rate limiting to every incoming request.

    Requests with request.state.user_id set (presumably by an upstream
    auth middleware — confirm ordering) are keyed and budgeted per user;
    anonymous requests fall back to a tighter per-IP budget.
    """

    async def dispatch(self, request: Request, call_next):
        user_id = getattr(request.state, "user_id", None)
        if user_id:
            identifier = f"user:{user_id}"
            capacity, rate = 60, 1.0  # authenticated: larger budget
        else:
            identifier = f"ip:{request.client.host}"
            capacity, rate = 10, 0.2  # anonymous: tighter budget
        allowed = await check_token_bucket(identifier, capacity=capacity, rate=rate)
        if allowed:
            return await call_next(request)
        # Denied: emit 429 with a conservative Retry-After hint.
        return Response(
            content='{"error":"rate_limit_exceeded","retry_after":60}',
            status_code=429,
            headers={
                "Content-Type": "application/json",
                "Retry-After": "60",
                "X-RateLimit-Limit": "60",
            },
        )


app.add_middleware(RateLimitMiddleware)
Tiered rate limits for API products
python
from enum import Enum
class PlanTier(str, Enum):
    """Subscription tiers that map to token-bucket budgets."""

    free = "free"
    pro = "pro"
    enterprise = "enterprise"


def _per_minute(requests_per_minute: int) -> dict:
    """Build token-bucket params for an N-requests-per-minute budget.

    capacity allows a full minute of burst; rate is the exact refill in
    tokens/second. (The previous hand-rounded constants drifted from the
    advertised limits: 16.7 tok/s is 1002/min, 0.17 tok/s is 10.2/min.)
    """
    return {"capacity": requests_per_minute, "rate": requests_per_minute / 60}


TIER_LIMITS = {
    PlanTier.free: _per_minute(10),          # 10/min
    PlanTier.pro: _per_minute(100),          # 100/min
    PlanTier.enterprise: _per_minute(1000),  # 1000/min
}
async def check_rate_limit_tiered(user: User) -> bool:
    """Rate-limit `user` with the budget of their subscription tier.

    Users whose subscription_tier is falsy (presumably unpaid or new
    accounts — confirm against the User model) get the free-tier budget.
    """
    plan = user.subscription_tier or PlanTier.free
    params = TIER_LIMITS[plan]
    return await check_token_bucket(
        f"user:{user.id}",
        capacity=params["capacity"],
        rate=params["rate"],
    )
Quiz: test your understanding
Before moving on, answer these in your head (or out loud):
- You have 8 Uvicorn workers each with an in-process rate limiter set to 100 req/min per user. What is the actual effective limit? How do you fix it?
- What is the "burst at boundary" problem with fixed-window rate limiting? Does the token bucket algorithm solve it? How?
- The Lua script for token bucket uses atomic Redis execution. Why is this necessary? What goes wrong without atomicity?
- A user sends 60 requests in the first second, then stops. With a token bucket (capacity=60, rate=1/sec), when can they send another 60 requests? When with a fixed 60/min window?
- How do you return rate limit information in HTTP headers so clients can implement smart backoff? What headers are standard?
Next up — Part 13: Caching Strategies & Redis. Cache-aside, write-through, thundering herd, and invalidation patterns.
Share this