Resilient message processing playbook

Checklist and code snippets for keeping queues flowing when consumers misbehave.

The fastest way to break a distributed system is to ignore how messages fail. We keep consumers predictable by codifying retries, backpressure, and observability in one playbook.

Isolate retries from DLQs

Send retriable errors back to the queue with jittered delays. Only poison messages land in the dead-letter queue.

const RETRYABLE = new Set(["ECONNRESET", "ETIMEDOUT"]);

export async function handle(message: Job) {
  try {
    await process(message);
  } catch (err) {
    const code = getCode(err);
    if (RETRYABLE.has(code)) {
      // Jitter the redelivery by 2-7s, clamped to a 30s ceiling.
      const delay = Math.min(30000, Math.random() * 5000 + 2000);
      await queue.nack(message, { requeue: true, delayMs: delay });
      return;
    }

    // Anything non-retriable is poison: park it with the error code attached.
    await dlq.publish({ ...message, error: code });
  }
}
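
Even retriable errors deserve a ceiling, or a dependency that stays down will bounce the same message forever. A minimal sketch of the same handler with an attempt cap, assuming the broker stamps a delivery counter on the message (the attempts field, MAX_ATTEMPTS, and handleWithCap are illustrative, not part of the snippet above):

const MAX_ATTEMPTS = 5; // illustrative cap, tune per topic

export async function handleWithCap(message: Job) {
  try {
    await process(message);
  } catch (err) {
    const code = getCode(err);
    // message.attempts is assumed to be a broker-provided delivery counter.
    if (RETRYABLE.has(code) && message.attempts < MAX_ATTEMPTS) {
      // Exponential backoff with full jitter, capped at 30s.
      const base = Math.min(30000, 1000 * 2 ** message.attempts);
      await queue.nack(message, { requeue: true, delayMs: Math.random() * base });
      return;
    }

    // Exhausted retries look just like poison: park them for inspection.
    await dlq.publish({ ...message, error: code, attempts: message.attempts });
  }
}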

Cap in-flight work with tokens

Consumer autoscaling is great until one pod grabs too much work. Hand out tokens from a fixed-size pool so each consumer caps how many messages it processes at once.

// Buffered channel as a counting semaphore: at most 32 handlers in flight per pod.
var tokens = make(chan struct{}, 32)

func handle(msg Message) {
  tokens <- struct{}{} // blocks the consumer loop when all tokens are taken
  go func() {
    defer func() { <-tokens }() // return the token even if process panics
    process(msg)
  }()
}
# surface head-of-line blocking
queue stat --topic invoices --format table

# total in-flight work ≈ consumer replicas * per-pod token count (32 here)
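
The same cap works inside a Node consumer; here is a rough promise-based equivalent of the Go channel above (the Semaphore class and handleCapped are illustrative names, not an existing library API):

// Illustrative promise-based semaphore; mirrors the Go channel pattern.
class Semaphore {
  private waiters: Array<() => void> = [];
  private inFlight = 0;

  constructor(private readonly limit: number) {}

  async acquire(): Promise<void> {
    if (this.inFlight < this.limit) {
      this.inFlight++;
      return;
    }
    // Wait until release() hands this waiter a slot.
    await new Promise<void>((resolve) => this.waiters.push(resolve));
  }

  release(): void {
    const next = this.waiters.shift();
    if (next) {
      next(); // hand the slot directly to the next waiter; inFlight stays the same
    } else {
      this.inFlight--;
    }
  }
}

const handlerLimit = new Semaphore(32);

export async function handleCapped(message: Job) {
  await handlerLimit.acquire(); // backpressure: stop taking work when the pool is empty
  try {
    await process(message);
  } finally {
    handlerLimit.release();
  }
}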

Attach context-rich logs

If a message lands in the DLQ, the breadcrumbs should already exist.

import structlog

log = structlog.get_logger()

async def process_message(msg: dict):
    # bind() returns a new logger that attaches these fields to every event
    bound = log.bind(message_id=msg["id"], tenant=msg.get("tenant"))
    bound.info("message.received")
    result = await run_work(msg)
    bound.info("message.done", latency_ms=result.latency_ms)
-- query DLQ volume per tenant
select tenant, count(*) as failures_last_hour
from dlq_events
where ts > now() - interval '1 hour'
group by tenant
order by failures_last_hour desc;
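
That query only works if the DLQ event carries the same fields the logs bind. A small sketch of an enriched DLQ payload, assuming Job exposes the id and tenant fields used above (DlqEvent and parkMessage are illustrative names):

// Illustrative DLQ payload: keep the join keys the logs already use.
interface DlqEvent {
  messageId: string;
  tenant?: string;
  error: string;
  ts: string;
}

async function parkMessage(message: Job, code: string) {
  const event: DlqEvent = {
    messageId: message.id,
    tenant: message.tenant, // assumed field on Job; adjust to your schema
    error: code,
    ts: new Date().toISOString(),
  };
  await dlq.publish(event);
}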

Keep replays boring

Reprocessing should be idempotent by default. Protect your handlers with idempotency keys and external locks.

class Handler
  def call(event)
    lock_key = "replay:#{event.id}"
    # Serialize replays of the same event across workers.
    with_lock(lock_key, ttl: 60) do
      # Idempotency check: skip events we have already persisted.
      return if seen?(event.id)

      persist(event)
      produce(event.next_step)
    end
  end
end
# spot check duplicates during a replay
rg "replay" logs/consumer.log | awk '{print $5}' | sort | uniq -c | sort -nr | head

Queues rarely fail loudly. By codifying these patterns, you can keep latency predictable even when downstreams are having a rough day.