Reliability & Latency
Prevent cascading failures by failing fast when a service is unhealthy
The circuit breaker pattern prevents an application from repeatedly trying to execute an operation that's likely to fail. It monitors for failures and 'opens the circuit' once a failure threshold is reached, failing fast instead of waiting for timeouts. After a cooldown period, it allows test requests through to check whether the service has recovered.
Use circuit breakers when calling external services, microservices, or any dependency that might fail or become slow. They are essential for preventing cascading failures in distributed systems.
General-purpose circuit breaking for microservices, API calls, and database connections
# Three-State Circuit Breaker Implementation
from enum import Enum
from datetime import datetime, timedelta
import threading
class CircuitState(Enum):
    """The three states a circuit breaker can be in."""

    # Normal operation: calls are passed to the protected function.
    CLOSED = "closed"

    # Failing fast: calls are rejected without being attempted.
    OPEN = "open"

    # Testing recovery: calls are let through to see whether the
    # protected function works again.
    HALF_OPEN = "half_open"
class CircuitBreakerOpenError(Exception):
    """Raised by ``CircuitBreaker.call`` when a call is rejected unrun.

    Subclasses ``Exception`` so callers that already catch ``Exception``
    keep working, while new callers can tell a rejection apart from an
    error raised by the protected function itself.
    """


class CircuitBreaker:
    """Three-state circuit breaker (CLOSED -> OPEN -> HALF_OPEN -> CLOSED).

    * CLOSED: calls run normally; ``failure_threshold`` consecutive
      failures open the circuit (any success resets the count).
    * OPEN: calls are rejected with ``CircuitBreakerOpenError`` until more
      than ``timeout`` seconds have passed since the last recorded failure.
    * HALF_OPEN: exactly one trial call is admitted at a time; its success
      closes the circuit, its failure re-opens it. Other calls arriving
      while the trial is in flight are rejected, so a service that is
      still unhealthy is not flooded again.

    State changes are guarded by ``self.lock``; the protected function
    itself runs outside the lock.
    """

    # Class-level default so instances whose __init__ was bypassed or
    # overridden still have the attribute.
    _probe_in_flight = False

    def __init__(self, failure_threshold=5, timeout=60, recovery_timeout=30):
        """Create a breaker in the CLOSED state.

        Args:
            failure_threshold: Failure count at which the circuit opens.
            timeout: Seconds to stay OPEN before admitting a trial call.
            recovery_timeout: Stored on the instance for API compatibility;
                this class never reads it.
        """
        self.failure_threshold = failure_threshold
        self.timeout = timeout  # seconds before trying half-open
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED
        self.lock = threading.Lock()
        self._probe_in_flight = False

    def call(self, func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` under the breaker's protection.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitBreakerOpenError: The circuit is OPEN and the cooldown
                has not elapsed, or it is HALF_OPEN and another trial call
                is already in flight. ``func`` is not invoked.
            Exception: Anything ``func`` raises is recorded as a failure
                and re-raised unchanged.
        """
        is_probe = False
        with self.lock:
            if self.state == CircuitState.OPEN:
                if not self._cooldown_elapsed():
                    raise CircuitBreakerOpenError(
                        "Circuit breaker is OPEN - failing fast"
                    )
                self.state = CircuitState.HALF_OPEN
                self._probe_in_flight = False
                print("Circuit breaker entering HALF_OPEN state")

            if self.state == CircuitState.HALF_OPEN:
                # Only one caller may test the service; everyone else keeps
                # failing fast until that trial reports back.
                if self._probe_in_flight:
                    raise CircuitBreakerOpenError(
                        "Circuit breaker is HALF_OPEN - "
                        "recovery probe already in flight"
                    )
                self._probe_in_flight = True
                is_probe = True

        try:
            result = func(*args, **kwargs)
        except Exception:
            self._on_failure()
            # Bare raise re-raises the active exception as-is.
            raise
        except BaseException:
            # KeyboardInterrupt / SystemExit etc. are not counted as a
            # service failure, but the probe slot must be handed back or
            # every later call would be rejected.
            if is_probe:
                with self.lock:
                    self._probe_in_flight = False
            raise
        self._on_success()
        return result

    def _cooldown_elapsed(self):
        """Return True if the OPEN cooldown is over. Caller holds the lock."""
        if self.last_failure_time is None:
            # OPEN without a recorded failure (e.g. state set by hand):
            # there is nothing to wait for.
            return True
        elapsed = datetime.now() - self.last_failure_time
        return elapsed > timedelta(seconds=self.timeout)

    def _on_success(self):
        """Record a successful call and close the circuit if it was a trial."""
        with self.lock:
            if self.state == CircuitState.HALF_OPEN:
                # Recovery successful
                self.state = CircuitState.CLOSED
                self.failure_count = 0
                self._probe_in_flight = False
                print("Circuit breaker CLOSED - service recovered")
            elif self.state == CircuitState.CLOSED:
                # Reset failure count on success
                self.failure_count = 0

    def _on_failure(self):
        """Record a failed call and open the circuit when warranted."""
        with self.lock:
            self.failure_count += 1
            self.last_failure_time = datetime.now()
            if self.state == CircuitState.HALF_OPEN:
                # Recovery failed, go back to open
                self.state = CircuitState.OPEN
                self._probe_in_flight = False
                print("Circuit breaker OPEN - recovery failed")
            elif self.failure_count >= self.failure_threshold:
                # Too many failures, open the circuit
                self.state = CircuitState.OPEN
                print(f"Circuit breaker OPEN - {self.failure_count} failures")
# Usage
breaker = CircuitBreaker(timeout=30, failure_threshold=3)


def call_external_api():
    """Fetch data through ``breaker``, falling back to cached data.

    Any ``Exception`` from ``breaker.call`` -- whether raised by the
    external service or by the breaker rejecting the call -- is replaced
    by the result of ``get_cached_data()``.
    """
    try:
        payload = breaker.call(external_service.get_data)
    except Exception:
        # Provide fallback response
        payload = get_cached_data()
    return payload
High-scale systems with variable traffic, services with changing performance characteristics
# Adaptive Circuit Breaker with Sliding Window
from collections import deque
from datetime import datetime, timedelta
class AdaptiveCircuitBreaker:
    """Circuit breaker driven by a sliding window of recent call outcomes.

    The last ``window_size`` calls are kept in ``self.requests``. Once the
    window is full, the circuit opens when the error rate exceeds
    ``error_threshold_percent`` or when the P95 latency of the successful
    calls exceeds ``latency_threshold_ms``. After ``reset_timeout_seconds``
    in the OPEN state the breaker goes HALF_OPEN and lets calls through
    again: the first recorded success closes the circuit and starts a
    fresh window, the first recorded failure re-opens it.

    State and window updates are guarded by ``self.lock``; the protected
    function itself runs outside the lock.
    """

    def __init__(self, window_size=100, error_threshold_percent=50,
                 latency_threshold_ms=1000, reset_timeout_seconds=30):
        """Create a breaker in the CLOSED state with an empty window.

        Args:
            window_size: Number of most recent calls that are evaluated.
            error_threshold_percent: Error rate (0-100) above which the
                circuit opens.
            latency_threshold_ms: P95 latency of successful calls, in
                milliseconds, above which the circuit opens.
            reset_timeout_seconds: Seconds to stay OPEN before calls are
                let through again in HALF_OPEN.
        """
        self.window_size = window_size
        self.error_threshold_percent = error_threshold_percent
        self.latency_threshold_ms = latency_threshold_ms
        self.reset_timeout_seconds = reset_timeout_seconds
        self.requests = deque(maxlen=window_size)
        self.state = CircuitState.CLOSED
        self.opened_at = None  # datetime of the most recent transition to OPEN
        self.lock = threading.Lock()

    def call(self, func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` and record its outcome and latency.

        Returns:
            Whatever ``func`` returns.

        Raises:
            Exception: With message "Circuit breaker is OPEN" when the
                circuit is OPEN and the reset timeout has not elapsed
                (``func`` is not invoked); otherwise whatever ``func``
                raises, after it has been recorded as a failure.
        """
        with self.lock:
            if self.state == CircuitState.OPEN:
                if self._should_attempt_reset():
                    self.state = CircuitState.HALF_OPEN
                else:
                    raise Exception("Circuit breaker is OPEN")

        start_time = datetime.now()
        try:
            result = func(*args, **kwargs)
        except Exception:
            self._record_failure()
            # Bare raise re-raises the active exception as-is.
            raise
        latency = (datetime.now() - start_time).total_seconds() * 1000
        self._record_success(latency)
        return result

    def _should_attempt_reset(self):
        """Return True once the OPEN cooldown is over. Caller holds the lock."""
        if self.opened_at is None:
            # OPEN without a recorded opening time (e.g. state set by
            # hand): there is nothing to wait for.
            return True
        cooldown = timedelta(seconds=self.reset_timeout_seconds)
        return datetime.now() - self.opened_at >= cooldown

    def _record_success(self, latency_ms):
        """Add a successful call to the window and update the state."""
        with self.lock:
            if self.state == CircuitState.HALF_OPEN:
                # The window still holds the samples that tripped the
                # breaker; evaluating them again would re-open the circuit
                # immediately, so recovery starts from an empty window.
                self.requests.clear()
                self.state = CircuitState.CLOSED
                self.opened_at = None
            self.requests.append({
                'success': True,
                'latency': latency_ms,
                'timestamp': datetime.now()
            })
            self._evaluate_state()

    def _record_failure(self):
        """Add a failed call to the window and update the state."""
        with self.lock:
            self.requests.append({
                'success': False,
                'timestamp': datetime.now()
            })
            if self.state == CircuitState.HALF_OPEN:
                # A failed trial call re-opens the circuit regardless of
                # what the window statistics currently say.
                self.state = CircuitState.OPEN
                self.opened_at = datetime.now()
                print("Circuit OPEN - recovery probe failed")
            else:
                self._evaluate_state()

    def _evaluate_state(self):
        """Open the circuit if the full window breaches a threshold.

        Must be called with ``self.lock`` held.
        """
        if self.state == CircuitState.OPEN:
            # Already open; re-stamping opened_at here would only extend
            # the cooldown.
            return
        if not self.requests or len(self.requests) < self.window_size:
            return  # Not enough data

        # Calculate error rate
        errors = sum(1 for r in self.requests if not r['success'])
        error_rate = (errors / len(self.requests)) * 100

        # Calculate P95 latency over the successful calls only
        latencies = sorted(r['latency'] for r in self.requests if r['success'])
        if latencies:
            p95_latency = latencies[int(len(latencies) * 0.95)]
        else:
            # No successful call in the window at all.
            p95_latency = float('inf')

        # Open circuit if thresholds exceeded
        if (error_rate > self.error_threshold_percent or
                p95_latency > self.latency_threshold_ms):
            self.state = CircuitState.OPEN
            self.opened_at = datetime.now()
            print(f"Circuit OPEN - Error rate: {error_rate}%, P95: {p95_latency}ms")
Hystrix library implements circuit breakers for all microservice calls
Thousands of microservices, millions of requests per second
Circuit breakers protect against cascading failures across AWS services
Global infrastructure with millions of service calls
Circuit breakers prevent driver matching failures from affecting the entire platform
Millions of rides per day across 70+ countries
Per-service instance - Each instance maintains its own state
Medium to High - Requires careful tuning and monitoring
Low - Minimal overhead, prevents expensive failures