Improve websocket message coalescing to handle thundering herds better (#118268)

* Increase websocket peak messages to match max expected entities

  During startup the websocket would frequently disconnect if more than 4096 entities were added back to back. Some MQTT setups will have more than 10000 entities. Match the websocket peak value to the max expected entities.

* coalesce more
* delay more if the backlog gets large
* wait to send if the queue is building rapidly
* tweak
* tweak for chrome since it works great in firefox but chrome cannot handle it
* Revert "tweak for chrome since it works great in firefox but chrome cannot handle it"

  This reverts commit 439e2d76b1.

* adjust for chrome
* lower number
* remove code
* fixes
* fast path for bytes
* compact
* adjust test since we see the close right away now on overload
* simplify check
* reduce loop
* tweak
* handle ready right away
2024-05-28 17:14:06 -10:00 · 2024-05-28 17:14:06 -10:00 · 79bc179ce8
commit 79bc179ce8
parent b94bf1f214
5 changed files with 124 additions and 69 deletions
--- a/homeassistant/components/websocket_api/http.py
+++ b/homeassistant/components/websocket_api/http.py
@ -24,6 +24,7 @@ from .auth import AUTH_REQUIRED_MESSAGE, AuthPhase
 from .const import (
    DATA_CONNECTIONS,
    MAX_PENDING_MSG,
+    PENDING_MSG_MAX_FORCE_READY,
    PENDING_MSG_PEAK,
    PENDING_MSG_PEAK_TIME,
    SIGNAL_WEBSOCKET_CONNECTED,
@ -67,6 +68,7 @@ class WebSocketHandler:

    __slots__ = (
        "_hass",
+        "_loop",
        "_request",
        "_wsock",
        "_handle_task",
@ -78,11 +80,13 @@ class WebSocketHandler:
        "_connection",
        "_message_queue",
        "_ready_future",
+        "_release_ready_queue_size",
    )

    def __init__(self, hass: HomeAssistant, request: web.Request) -> None:
        """Initialize an active connection."""
        self._hass = hass
+        self._loop = hass.loop
        self._request: web.Request = request
        self._wsock = web.WebSocketResponse(heartbeat=55)
        self._handle_task: asyncio.Task | None = None
@ -97,8 +101,9 @@ class WebSocketHandler:
        # to where messages are queued. This allows the implementation
        # to use a deque and an asyncio.Future to avoid the overhead of
        # an asyncio.Queue.
-        self._message_queue: deque[bytes | None] = deque()
-        self._ready_future: asyncio.Future[None] | None = None
+        self._message_queue: deque[bytes] = deque()
+        self._ready_future: asyncio.Future[int] | None = None
+        self._release_ready_queue_size: int = 0

    def __repr__(self) -> str:
        """Return the representation."""
@ -126,45 +131,35 @@ class WebSocketHandler:
        message_queue = self._message_queue
        logger = self._logger
        wsock = self._wsock
-        loop = self._hass.loop
+        loop = self._loop
+        is_debug_log_enabled = partial(logger.isEnabledFor, logging.DEBUG)
        debug = logger.debug
-        is_enabled_for = logger.isEnabledFor
-        logging_debug = logging.DEBUG
+        can_coalesce = self._connection and self._connection.can_coalesce
+        ready_message_count = len(message_queue)
        # Exceptions if Socket disconnected or cancelled by connection handler
        try:
            while not wsock.closed:
-                if (messages_remaining := len(message_queue)) == 0:
+                if not message_queue:
                    self._ready_future = loop.create_future()
-                    await self._ready_future
-                    messages_remaining = len(message_queue)
+                    ready_message_count = await self._ready_future

-                # A None message is used to signal the end of the connection
-                if (message := message_queue.popleft()) is None:
+                if self._closing:
                    return

-                debug_enabled = is_enabled_for(logging_debug)
-                messages_remaining -= 1
+                if not can_coalesce:
+                    # coalesce may be enabled later in the connection
+                    can_coalesce = self._connection and self._connection.can_coalesce

-                if (
-                    not messages_remaining
-                    or not (connection := self._connection)
-                    or not connection.can_coalesce
-                ):
-                    if debug_enabled:
+                if not can_coalesce or ready_message_count == 1:
+                    message = message_queue.popleft()
+                    if is_debug_log_enabled():
                        debug("%s: Sending %s", self.description, message)
                    await send_bytes_text(message)
                    continue

-                messages: list[bytes] = [message]
-                while messages_remaining:
-                    # A None message is used to signal the end of the connection
-                    if (message := message_queue.popleft()) is None:
-                        return
-                    messages.append(message)
-                    messages_remaining -= 1
-
-                coalesced_messages = b"".join((b"[", b",".join(messages), b"]"))
-                if debug_enabled:
+                coalesced_messages = b"".join((b"[", b",".join(message_queue), b"]"))
+                message_queue.clear()
+                if is_debug_log_enabled():
                    debug("%s: Sending %s", self.description, coalesced_messages)
                await send_bytes_text(coalesced_messages)
        except asyncio.CancelledError:
@ -197,14 +192,15 @@ class WebSocketHandler:
            # max pending messages.
            return

-        if isinstance(message, dict):
-            message = message_to_json_bytes(message)
-        elif isinstance(message, str):
-            message = message.encode("utf-8")
+        if type(message) is not bytes:  # noqa: E721
+            if isinstance(message, dict):
+                message = message_to_json_bytes(message)
+            elif isinstance(message, str):
+                message = message.encode("utf-8")

        message_queue = self._message_queue
-        queue_size_before_add = len(message_queue)
-        if queue_size_before_add >= MAX_PENDING_MSG:
+        message_queue.append(message)
+        if (queue_size_after_add := len(message_queue)) >= MAX_PENDING_MSG:
            self._logger.error(
                (
                    "%s: Client unable to keep up with pending messages. Reached %s pending"
@ -218,14 +214,14 @@ class WebSocketHandler:
            self._cancel()
            return

-        message_queue.append(message)
-        ready_future = self._ready_future
-        if ready_future and not ready_future.done():
-            ready_future.set_result(None)
+        if self._release_ready_queue_size == 0:
+            # Try to coalesce more messages to reduce the number of writes
+            self._release_ready_queue_size = queue_size_after_add
+            self._loop.call_soon(self._release_ready_future_or_reschedule)

        peak_checker_active = self._peak_checker_unsub is not None

-        if queue_size_before_add <= PENDING_MSG_PEAK:
+        if queue_size_after_add <= PENDING_MSG_PEAK:
            if peak_checker_active:
                self._cancel_peak_checker()
            return
@ -235,6 +231,32 @@ class WebSocketHandler:
                self._hass, PENDING_MSG_PEAK_TIME, self._check_write_peak
            )

+    @callback
+    def _release_ready_future_or_reschedule(self) -> None:
+        """Release the ready future or reschedule.
+
+        We will release the ready future if the queue did not grow since the
+        last time we tried to release the ready future.
+
+        If we reach PENDING_MSG_MAX_FORCE_READY, we will release the ready future
+        immediately to prevent the coalesced messages from growing too large.
+        """
+        if not (ready_future := self._ready_future) or not (
+            queue_size := len(self._message_queue)
+        ):
+            self._release_ready_queue_size = 0
+            return
+        # If we are below the max pending to force ready, and there are new messages
+        # in the queue since the last time we tried to release the ready future, we
+        # try again later so we can coalesce more messages.
+        if queue_size > self._release_ready_queue_size < PENDING_MSG_MAX_FORCE_READY:
+            self._release_ready_queue_size = queue_size
+            self._loop.call_soon(self._release_ready_future_or_reschedule)
+            return
+        self._release_ready_queue_size = 0
+        if not ready_future.done():
+            ready_future.set_result(queue_size)
+
    @callback
    def _check_write_peak(self, _utc_time: dt.datetime) -> None:
        """Check that we are no longer above the write peak."""
@ -440,10 +462,8 @@ class WebSocketHandler:
                connection.async_handle_close()

            self._closing = True
-
-            self._message_queue.append(None)
            if self._ready_future and not self._ready_future.done():
-                self._ready_future.set_result(None)
+                self._ready_future.set_result(len(self._message_queue))

            # If the writer gets canceled we still need to close the websocket
            # so we have another finally block to make sure we close the websocket