Improve websocket message coalescing to handle thundering herds better (#118268)

* Increase websocket peak messages to match max expected entities

During startup, the websocket would frequently disconnect if more than
4096 entities were added back to back. Some MQTT setups will have more
than 10000 entities. Match the websocket peak value to the max expected
entities.

* coalesce more

* delay more if the backlog gets large

* wait to send if the queue is building rapidly

* tweak

* tweak for chrome since it works great in firefox but chrome cannot handle it

* Revert "tweak for chrome since it works great in firefox but chrome cannot handle it"

This reverts commit 439e2d76b1.

* adjust for chrome

* lower number

* remove code

* fixes

* fast path for bytes

* compact

* adjust test since we see the close right away now on overload

* simplify check

* reduce loop

* tweak

* handle ready right away
This commit is contained in:
J. Nick Koston 2024-05-28 17:14:06 -10:00 committed by GitHub
parent b94bf1f214
commit 79bc179ce8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 124 additions and 69 deletions

View file

@ -24,6 +24,7 @@ from .auth import AUTH_REQUIRED_MESSAGE, AuthPhase
from .const import (
DATA_CONNECTIONS,
MAX_PENDING_MSG,
PENDING_MSG_MAX_FORCE_READY,
PENDING_MSG_PEAK,
PENDING_MSG_PEAK_TIME,
SIGNAL_WEBSOCKET_CONNECTED,
@ -67,6 +68,7 @@ class WebSocketHandler:
__slots__ = (
"_hass",
"_loop",
"_request",
"_wsock",
"_handle_task",
@ -78,11 +80,13 @@ class WebSocketHandler:
"_connection",
"_message_queue",
"_ready_future",
"_release_ready_queue_size",
)
def __init__(self, hass: HomeAssistant, request: web.Request) -> None:
"""Initialize an active connection."""
self._hass = hass
self._loop = hass.loop
self._request: web.Request = request
self._wsock = web.WebSocketResponse(heartbeat=55)
self._handle_task: asyncio.Task | None = None
@ -97,8 +101,9 @@ class WebSocketHandler:
# to where messages are queued. This allows the implementation
# to use a deque and an asyncio.Future to avoid the overhead of
# an asyncio.Queue.
self._message_queue: deque[bytes | None] = deque()
self._ready_future: asyncio.Future[None] | None = None
self._message_queue: deque[bytes] = deque()
self._ready_future: asyncio.Future[int] | None = None
self._release_ready_queue_size: int = 0
def __repr__(self) -> str:
"""Return the representation."""
@ -126,45 +131,35 @@ class WebSocketHandler:
message_queue = self._message_queue
logger = self._logger
wsock = self._wsock
loop = self._hass.loop
loop = self._loop
is_debug_log_enabled = partial(logger.isEnabledFor, logging.DEBUG)
debug = logger.debug
is_enabled_for = logger.isEnabledFor
logging_debug = logging.DEBUG
can_coalesce = self._connection and self._connection.can_coalesce
ready_message_count = len(message_queue)
# Exceptions if Socket disconnected or cancelled by connection handler
try:
while not wsock.closed:
if (messages_remaining := len(message_queue)) == 0:
if not message_queue:
self._ready_future = loop.create_future()
await self._ready_future
messages_remaining = len(message_queue)
ready_message_count = await self._ready_future
# A None message is used to signal the end of the connection
if (message := message_queue.popleft()) is None:
if self._closing:
return
debug_enabled = is_enabled_for(logging_debug)
messages_remaining -= 1
if not can_coalesce:
# coalesce may be enabled later in the connection
can_coalesce = self._connection and self._connection.can_coalesce
if (
not messages_remaining
or not (connection := self._connection)
or not connection.can_coalesce
):
if debug_enabled:
if not can_coalesce or ready_message_count == 1:
message = message_queue.popleft()
if is_debug_log_enabled():
debug("%s: Sending %s", self.description, message)
await send_bytes_text(message)
continue
messages: list[bytes] = [message]
while messages_remaining:
# A None message is used to signal the end of the connection
if (message := message_queue.popleft()) is None:
return
messages.append(message)
messages_remaining -= 1
coalesced_messages = b"".join((b"[", b",".join(messages), b"]"))
if debug_enabled:
coalesced_messages = b"".join((b"[", b",".join(message_queue), b"]"))
message_queue.clear()
if is_debug_log_enabled():
debug("%s: Sending %s", self.description, coalesced_messages)
await send_bytes_text(coalesced_messages)
except asyncio.CancelledError:
@ -197,14 +192,15 @@ class WebSocketHandler:
# max pending messages.
return
if isinstance(message, dict):
message = message_to_json_bytes(message)
elif isinstance(message, str):
message = message.encode("utf-8")
if type(message) is not bytes: # noqa: E721
if isinstance(message, dict):
message = message_to_json_bytes(message)
elif isinstance(message, str):
message = message.encode("utf-8")
message_queue = self._message_queue
queue_size_before_add = len(message_queue)
if queue_size_before_add >= MAX_PENDING_MSG:
message_queue.append(message)
if (queue_size_after_add := len(message_queue)) >= MAX_PENDING_MSG:
self._logger.error(
(
"%s: Client unable to keep up with pending messages. Reached %s pending"
@ -218,14 +214,14 @@ class WebSocketHandler:
self._cancel()
return
message_queue.append(message)
ready_future = self._ready_future
if ready_future and not ready_future.done():
ready_future.set_result(None)
if self._release_ready_queue_size == 0:
# Try to coalesce more messages to reduce the number of writes
self._release_ready_queue_size = queue_size_after_add
self._loop.call_soon(self._release_ready_future_or_reschedule)
peak_checker_active = self._peak_checker_unsub is not None
if queue_size_before_add <= PENDING_MSG_PEAK:
if queue_size_after_add <= PENDING_MSG_PEAK:
if peak_checker_active:
self._cancel_peak_checker()
return
@ -235,6 +231,32 @@ class WebSocketHandler:
self._hass, PENDING_MSG_PEAK_TIME, self._check_write_peak
)
@callback
def _release_ready_future_or_reschedule(self) -> None:
"""Release the ready future or reschedule.

We will release the ready future if the queue did not grow since the
last time we tried to release the ready future.

If we reach PENDING_MSG_MAX_FORCE_READY, we will release the ready future
immediately to keep the coalesced messages from growing too large.

When released, the future is resolved with the current queue length.
"""
# No future to release, or nothing queued: clear the recorded size. The
# sender only schedules this callback while the recorded size is 0.
if not (ready_future := self._ready_future) or not (
queue_size := len(self._message_queue)
):
self._release_ready_queue_size = 0
return
# If we are below the max pending to force ready, and there are new messages
# in the queue since the last time we tried to release the ready future, we
# try again later so we can coalesce more messages.
# NOTE: the chained comparison checks the previously recorded size (not the
# current queue size) against PENDING_MSG_MAX_FORCE_READY.
if queue_size > self._release_ready_queue_size < PENDING_MSG_MAX_FORCE_READY:
self._release_ready_queue_size = queue_size
self._loop.call_soon(self._release_ready_future_or_reschedule)
return
# The queue stopped growing or the force-ready threshold was reached:
# clear the recorded size and resolve the future with the queue length.
self._release_ready_queue_size = 0
if not ready_future.done():
ready_future.set_result(queue_size)
@callback
def _check_write_peak(self, _utc_time: dt.datetime) -> None:
"""Check that we are no longer above the write peak."""
@ -440,10 +462,8 @@ class WebSocketHandler:
connection.async_handle_close()
self._closing = True
self._message_queue.append(None)
if self._ready_future and not self._ready_future.done():
self._ready_future.set_result(None)
self._ready_future.set_result(len(self._message_queue))
# If the writer gets canceled we still need to close the websocket
# so we have another finally block to make sure we close the websocket