Resilient message processing playbook
Checklist and code snippets for keeping queues flowing when consumers misbehave.
The fastest way to break a distributed system is to ignore how messages fail. We keep consumers predictable by codifying retries, backpressure, and observability in one playbook.
Isolate retries from DLQs
Requeue retryable errors with jittered delays. Only poison messages land in the dead-letter queue.
const RETRYABLE = new Set(["ECONNRESET", "ETIMEDOUT"]);

export async function handle(message: Job) {
  try {
    await process(message);
  } catch (err) {
    const code = getCode(err);
    if (RETRYABLE.has(code)) {
      // 2-7s of jitter; the 30s cap only matters once backoff grows (see below).
      const delay = Math.min(30000, Math.random() * 5000 + 2000);
      await queue.nack(message, { requeue: true, delayMs: delay });
      return;
    }
    // Poison message: park it with the error code so replays can inspect it.
    await dlq.publish({ ...message, error: code });
    await queue.ack(message); // assumes the queue client exposes ack(); without it the broker would redeliver
  }
}
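Retries also need a ceiling so a flaky-but-not-poison message cannot bounce forever. A minimal sketch of attempt-aware backoff, assuming the broker tracks a delivery count on the message; the attempts field, MAX_ATTEMPTS, and queue.ack are assumptions layered on the snippet above:

const MAX_ATTEMPTS = 5;

// Exponential backoff with full jitter, capped at 30s.
function backoffMs(attempt: number): number {
  const ceiling = Math.min(30_000, 1_000 * 2 ** attempt);
  return Math.floor(Math.random() * ceiling);
}

async function retryOrPark(message: Job & { attempts: number }, code: string): Promise<void> {
  if (message.attempts >= MAX_ATTEMPTS) {
    // Past the cap we treat it as poison: park it and stop redelivery.
    await dlq.publish({ ...message, error: code, reason: "max_attempts" });
    await queue.ack(message);
    return;
  }
  await queue.nack(message, { requeue: true, delayMs: backoffMs(message.attempts) });
}

Swapping this into the catch block above replaces the flat 2-7s delay once a message starts accumulating attempts.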
Cap in-flight work with tokens
Consumer autoscaling is great until one pod grabs too much work. Give each pod a fixed pool of tokens (a counting semaphore) so it only works on a bounded number of messages at once.
// 32 tokens: at most 32 messages are processed concurrently per pod.
var tokens = make(chan struct{}, 32)

func handle(msg Message) {
	tokens <- struct{}{} // block until a token is free
	go func() {
		defer func() { <-tokens }() // hand the token back when processing ends
		process(msg)
	}()
}
# surface head-of-line blocking
queue stat --topic invoices --format table
# in-flight count should stay close to consumer replicas * per-pod token limit
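If the consumer runs on Node instead of Go, the same cap is a small promise-based semaphore. A rough sketch; Tokens and processMessage are illustrative names, not part of any client library:

// Counting semaphore: at most `limit` messages are in flight per pod.
class Tokens {
  private free: number;
  private waiters: Array<() => void> = [];

  constructor(limit: number) {
    this.free = limit;
  }

  async acquire(): Promise<void> {
    if (this.free > 0) {
      this.free -= 1;
      return;
    }
    // No token available: wait until release() hands one over.
    await new Promise<void>((resolve) => this.waiters.push(resolve));
  }

  release(): void {
    const next = this.waiters.shift();
    if (next) next(); // pass the token straight to the next waiter
    else this.free += 1;
  }
}

const tokens = new Tokens(32);

async function handleWithTokens(msg: unknown): Promise<void> {
  await tokens.acquire();
  try {
    await processMessage(msg); // placeholder for the real handler
  } finally {
    tokens.release();
  }
}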
Attach context-rich logs
If a message lands in the DLQ, the breadcrumbs should already exist.
import structlog

logger = structlog.get_logger()

async def process_message(msg: dict):
    # bind() returns a new logger that stamps these fields on every event.
    log = logger.bind(message_id=msg["id"], tenant=msg.get("tenant"))
    log.info("message.received")
    result = await run_work(msg)
    log.info("message.done", latency_ms=result.latency_ms)
-- query DLQ volume per tenant
select tenant, count(*) as failures_last_hour
from dlq_events
where ts > now() - interval '1 hour'
group by tenant
order by failures_last_hour desc;
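That query only works if the publisher writes those columns in the first place. A sketch of the event shape at publish time, assuming the field names mirror the dlq_events table and reusing the hypothetical dlq client from the retry handler:

interface DlqEvent {
  message_id: string;
  tenant: string | null;
  error: string;
  ts: string; // ISO-8601 timestamp; maps to the ts column queried above
}

async function parkInDlq(msg: { id: string; tenant?: string }, code: string): Promise<void> {
  const event: DlqEvent = {
    message_id: msg.id,
    tenant: msg.tenant ?? null,
    error: code,
    ts: new Date().toISOString(),
  };
  await dlq.publish(event);
}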
Keep replays boring
Reprocessing should be idempotent by default. Protect your handlers with idempotency keys and external locks.
class Handler
  def call(event)
    lock_key = "replay:#{event.id}"
    # Serialize competing consumers on the same event; the TTL releases stuck locks.
    with_lock(lock_key, ttl: 60) do
      return if seen?(event.id) # already handled, so the replay is a no-op
      persist(event)
      produce(event.next_step)
    end
  end
end
# spot check duplicates during a replay (assumes the message id is the 5th whitespace-separated log field)
rg "replay" logs/consumer.log | awk '{print $5}' | sort | uniq -c | sort -nr | head
Queues rarely fail loudly. By codifying these patterns, you can keep latency predictable even when downstreams are having a rough day.