fix(websocket): order register/unregister via single ops channel

With register and unregister on two separate channels under one select,
Go's pseudo-random choice among ready cases could process an unregister
before its matching register from the same goroutine: the unregister
was then a silent no-op and the late register leaked the entry into the
client map. Replace both with a single ordered ops channel so program
order is preserved end-to-end.
This commit is contained in:
MHSanaei 2026-05-19 12:34:53 +02:00
parent 85e2ded0e1
commit 6000bc7134
No known key found for this signature in database
GPG key ID: 7E4060F2FBE5AB7A

View file

@ -29,11 +29,23 @@ const (
enqueueTimeout = 100 * time.Millisecond enqueueTimeout = 100 * time.Millisecond
clientSendQueue = 512 // ~50s of buffering for a momentarily slow browser. clientSendQueue = 512 // ~50s of buffering for a momentarily slow browser.
hubBroadcastQueue = 2048 // Headroom for cron-storm + admin-mutation bursts. hubBroadcastQueue = 2048 // Headroom for cron-storm + admin-mutation bursts.
hubControlQueue = 64 // Backlog for register/unregister bursts (page reloads, disconnect storms). hubOpsQueue = 128 // Backlog for register+unregister bursts (page reloads, disconnect storms).
minBroadcastInterval = 250 * time.Millisecond minBroadcastInterval = 250 * time.Millisecond
hubRestartAttempts = 3 hubRestartAttempts = 3
) )
// clientOpKind selects which membership change a clientOp asks the hub to apply.
type clientOpKind int
const (
// opRegister adds the client to the hub's client map.
opRegister clientOpKind = iota
// opUnregister removes the client via Hub.removeClient.
opUnregister
)
// clientOp is the message type of the Hub.ops channel: one register or
// unregister request for client c. Both kinds travel over that single channel
// so the hub receives one sender's requests in the order they were sent.
// The hub loop ignores an op whose c is nil.
type clientOp struct {
kind clientOpKind // which change to apply
c *Client // client to add or remove
}
// NewClient builds a Client ready for hub registration. // NewClient builds a Client ready for hub registration.
func NewClient(id string) *Client { func NewClient(id string) *Client {
return &Client{ return &Client{
@ -60,8 +72,7 @@ type Client struct {
type Hub struct { type Hub struct {
clients map[*Client]struct{} clients map[*Client]struct{}
broadcast chan []byte broadcast chan []byte
register chan *Client ops chan clientOp
unregister chan *Client
mu sync.RWMutex mu sync.RWMutex
ctx context.Context ctx context.Context
cancel context.CancelFunc cancel context.CancelFunc
@ -76,8 +87,7 @@ func NewHub() *Hub {
return &Hub{ return &Hub{
clients: make(map[*Client]struct{}), clients: make(map[*Client]struct{}),
broadcast: make(chan []byte, hubBroadcastQueue), broadcast: make(chan []byte, hubBroadcastQueue),
register: make(chan *Client, hubControlQueue), ops: make(chan clientOp, hubOpsQueue),
unregister: make(chan *Client, hubControlQueue),
ctx: ctx, ctx: ctx,
cancel: cancel, cancel: cancel,
lastBroadcast: make(map[MessageType]time.Time), lastBroadcast: make(map[MessageType]time.Time),
@ -145,21 +155,20 @@ func (h *Hub) runOnce() (stopped bool) {
h.shutdown() h.shutdown()
return true return true
case c := <-h.register: case op := <-h.ops:
if c == nil { if op.c == nil {
continue continue
} }
switch op.kind {
case opRegister:
h.mu.Lock() h.mu.Lock()
h.clients[c] = struct{}{} h.clients[op.c] = struct{}{}
n := len(h.clients) n := len(h.clients)
h.mu.Unlock() h.mu.Unlock()
logger.Debugf("WebSocket client connected: %s (total: %d)", c.ID, n) logger.Debugf("WebSocket client connected: %s (total: %d)", op.c.ID, n)
case opUnregister:
case c := <-h.unregister: h.removeClient(op.c)
if c == nil {
continue
} }
h.removeClient(c)
case msg := <-h.broadcast: case msg := <-h.broadcast:
h.fanout(msg) h.fanout(msg)
@ -321,29 +330,29 @@ func (h *Hub) Register(c *Client) {
return return
} }
select { select {
case h.register <- c: case h.ops <- clientOp{kind: opRegister, c: c}:
case <-h.ctx.Done(): case <-h.ctx.Done():
} }
} }
// Unregister removes a client from the hub. Fast path queues for the hub // Unregister removes a client from the hub. Sends through the same ordered
// goroutine; if the channel is saturated (disconnect storm) we fall back // ops channel as Register so a register-then-unregister sequence from one
// to a direct removal under the write lock so dead clients aren't left in // goroutine is processed in program order — otherwise an unregister could
// the registry waiting for their Send buffer to fill (minutes of wasted // land in the map before its register and silently no-op, leaking the entry.
// fanout work at low broadcast rates).
// //
// Direct removal is safe from any caller: external goroutines (read/write // On a saturated ops channel (disconnect storm) we fall back to a bounded
// pumps) hold no hub locks, and the hub goroutine itself never holds h.mu // timeout drop rather than direct removal: a direct delete on a not-yet-
// when it calls Unregister — fanout releases its RLock before per-client // registered client is precisely the ordering bug we fix here. Stragglers
// sends, so we can't self-deadlock here. // get evicted by fanout when their Send buffer fills.
func (h *Hub) Unregister(c *Client) { func (h *Hub) Unregister(c *Client) {
if h == nil || c == nil { if h == nil || c == nil {
return return
} }
select { select {
case h.unregister <- c: case h.ops <- clientOp{kind: opUnregister, c: c}:
default: case <-time.After(enqueueTimeout):
h.removeClient(c) logger.Warningf("WebSocket ops channel full, dropping unregister for %s", c.ID)
case <-h.ctx.Done():
} }
} }