When we hit the absolute limit, we would log the last messages as it was key to finding out the source. We now do the same when we hit the peak limit
403 lines
15 KiB
Python
403 lines
15 KiB
Python
"""View to accept incoming websocket connection."""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from collections import deque
|
|
from collections.abc import Callable
|
|
from contextlib import suppress
|
|
import datetime as dt
|
|
import logging
|
|
from typing import TYPE_CHECKING, Any, Final
|
|
|
|
from aiohttp import WSMsgType, web
|
|
import async_timeout
|
|
|
|
from homeassistant.components.http import HomeAssistantView
|
|
from homeassistant.const import EVENT_HOMEASSISTANT_STOP
|
|
from homeassistant.core import Event, HomeAssistant, callback
|
|
from homeassistant.helpers.dispatcher import async_dispatcher_send
|
|
from homeassistant.helpers.event import async_call_later
|
|
from homeassistant.util.json import json_loads
|
|
|
|
from .auth import AuthPhase, auth_required_message
|
|
from .const import (
|
|
CANCELLATION_ERRORS,
|
|
DATA_CONNECTIONS,
|
|
MAX_PENDING_MSG,
|
|
PENDING_MSG_PEAK,
|
|
PENDING_MSG_PEAK_TIME,
|
|
SIGNAL_WEBSOCKET_CONNECTED,
|
|
SIGNAL_WEBSOCKET_DISCONNECTED,
|
|
URL,
|
|
)
|
|
from .error import Disconnect
|
|
from .messages import message_to_json
|
|
from .util import describe_request
|
|
|
|
if TYPE_CHECKING:
|
|
from .connection import ActiveConnection
|
|
|
|
|
|
_WS_LOGGER: Final = logging.getLogger(f"{__name__}.connection")
|
|
|
|
|
|
class WebsocketAPIView(HomeAssistantView):
|
|
"""View to serve a websockets endpoint."""
|
|
|
|
name: str = "websocketapi"
|
|
url: str = URL
|
|
requires_auth: bool = False
|
|
|
|
async def get(self, request: web.Request) -> web.WebSocketResponse:
|
|
"""Handle an incoming websocket connection."""
|
|
return await WebSocketHandler(request.app["hass"], request).async_handle()
|
|
|
|
|
|
class WebSocketAdapter(logging.LoggerAdapter):
|
|
"""Add connection id to websocket messages."""
|
|
|
|
def process(self, msg: str, kwargs: Any) -> tuple[str, Any]:
|
|
"""Add connid to websocket log messages."""
|
|
if not self.extra or "connid" not in self.extra:
|
|
return msg, kwargs
|
|
return f'[{self.extra["connid"]}] {msg}', kwargs
|
|
|
|
|
|
class WebSocketHandler:
|
|
"""Handle an active websocket client connection."""
|
|
|
|
def __init__(self, hass: HomeAssistant, request: web.Request) -> None:
|
|
"""Initialize an active connection."""
|
|
self.hass = hass
|
|
self.request = request
|
|
self.wsock = web.WebSocketResponse(heartbeat=55)
|
|
self._handle_task: asyncio.Task | None = None
|
|
self._writer_task: asyncio.Task | None = None
|
|
self._closing: bool = False
|
|
self._logger = WebSocketAdapter(_WS_LOGGER, {"connid": id(self)})
|
|
self._peak_checker_unsub: Callable[[], None] | None = None
|
|
self.connection: ActiveConnection | None = None
|
|
|
|
# The WebSocketHandler has a single consumer and path
|
|
# to where messages are queued. This allows the implementation
|
|
# to use a deque and an asyncio.Future to avoid the overhead of
|
|
# an asyncio.Queue.
|
|
self._message_queue: deque = deque()
|
|
self._ready_future: asyncio.Future[None] | None = None
|
|
|
|
@property
|
|
def description(self) -> str:
|
|
"""Return a description of the connection."""
|
|
if self.connection is not None:
|
|
return self.connection.get_description(self.request)
|
|
return describe_request(self.request)
|
|
|
|
async def _writer(self) -> None:
|
|
"""Write outgoing messages."""
|
|
# Variables are set locally to avoid lookups in the loop
|
|
message_queue = self._message_queue
|
|
logger = self._logger
|
|
send_str = self.wsock.send_str
|
|
loop = self.hass.loop
|
|
debug = logger.debug
|
|
# Exceptions if Socket disconnected or cancelled by connection handler
|
|
try:
|
|
with suppress(RuntimeError, ConnectionResetError, *CANCELLATION_ERRORS):
|
|
while not self.wsock.closed:
|
|
if (messages_remaining := len(message_queue)) == 0:
|
|
self._ready_future = loop.create_future()
|
|
await self._ready_future
|
|
messages_remaining = len(message_queue)
|
|
|
|
# A None message is used to signal the end of the connection
|
|
if (process := message_queue.popleft()) is None:
|
|
return
|
|
|
|
messages_remaining -= 1
|
|
message = process if isinstance(process, str) else process()
|
|
|
|
if (
|
|
not messages_remaining
|
|
or not self.connection
|
|
or not self.connection.can_coalesce
|
|
):
|
|
debug("Sending %s", message)
|
|
await send_str(message)
|
|
continue
|
|
|
|
messages: list[str] = [message]
|
|
while messages_remaining:
|
|
# A None message is used to signal the end of the connection
|
|
if (process := message_queue.popleft()) is None:
|
|
return
|
|
messages.append(
|
|
process if isinstance(process, str) else process()
|
|
)
|
|
messages_remaining -= 1
|
|
|
|
coalesced_messages = "[" + ",".join(messages) + "]"
|
|
debug("Sending %s", coalesced_messages)
|
|
await send_str(coalesced_messages)
|
|
finally:
|
|
# Clean up the peak checker when we shut down the writer
|
|
self._cancel_peak_checker()
|
|
|
|
@callback
|
|
def _cancel_peak_checker(self) -> None:
|
|
"""Cancel the peak checker."""
|
|
if self._peak_checker_unsub is not None:
|
|
self._peak_checker_unsub()
|
|
self._peak_checker_unsub = None
|
|
|
|
@callback
|
|
def _send_message(self, message: str | dict[str, Any] | Callable[[], str]) -> None:
|
|
"""Send a message to the client.
|
|
|
|
Closes connection if the client is not reading the messages.
|
|
|
|
Async friendly.
|
|
"""
|
|
if self._closing:
|
|
# Connection is cancelled, don't flood logs about exceeding
|
|
# max pending messages.
|
|
return
|
|
|
|
if isinstance(message, dict):
|
|
message = message_to_json(message)
|
|
|
|
message_queue = self._message_queue
|
|
queue_size_before_add = len(message_queue)
|
|
if queue_size_before_add >= MAX_PENDING_MSG:
|
|
self._logger.error(
|
|
(
|
|
"%s: Client unable to keep up with pending messages. Reached %s pending"
|
|
" messages. The system's load is too high or an integration is"
|
|
" misbehaving; Last message was: %s"
|
|
),
|
|
self.description,
|
|
MAX_PENDING_MSG,
|
|
message,
|
|
)
|
|
self._cancel()
|
|
return
|
|
|
|
message_queue.append(message)
|
|
if self._ready_future and not self._ready_future.done():
|
|
self._ready_future.set_result(None)
|
|
|
|
peak_checker_active = self._peak_checker_unsub is not None
|
|
|
|
if queue_size_before_add <= PENDING_MSG_PEAK:
|
|
if peak_checker_active:
|
|
self._cancel_peak_checker()
|
|
return
|
|
|
|
if not peak_checker_active:
|
|
self._peak_checker_unsub = async_call_later(
|
|
self.hass, PENDING_MSG_PEAK_TIME, self._check_write_peak
|
|
)
|
|
|
|
@callback
|
|
def _check_write_peak(self, _utc_time: dt.datetime) -> None:
|
|
"""Check that we are no longer above the write peak."""
|
|
self._peak_checker_unsub = None
|
|
|
|
if len(self._message_queue) < PENDING_MSG_PEAK:
|
|
return
|
|
|
|
self._logger.error(
|
|
(
|
|
"%s: Client unable to keep up with pending messages. Stayed over %s for %s"
|
|
" seconds. The system's load is too high or an integration is"
|
|
" misbehaving; Last message was: %s"
|
|
),
|
|
self.description,
|
|
PENDING_MSG_PEAK,
|
|
PENDING_MSG_PEAK_TIME,
|
|
self._message_queue[-1],
|
|
)
|
|
self._cancel()
|
|
|
|
@callback
|
|
def _cancel(self) -> None:
|
|
"""Cancel the connection."""
|
|
self._closing = True
|
|
self._cancel_peak_checker()
|
|
if self._handle_task is not None:
|
|
self._handle_task.cancel()
|
|
if self._writer_task is not None:
|
|
self._writer_task.cancel()
|
|
|
|
async def async_handle(self) -> web.WebSocketResponse:
|
|
"""Handle a websocket response."""
|
|
request = self.request
|
|
wsock = self.wsock
|
|
try:
|
|
async with async_timeout.timeout(10):
|
|
await wsock.prepare(request)
|
|
except asyncio.TimeoutError:
|
|
self._logger.warning("Timeout preparing request from %s", request.remote)
|
|
return wsock
|
|
|
|
self._logger.debug("Connected from %s", request.remote)
|
|
self._handle_task = asyncio.current_task()
|
|
|
|
@callback
|
|
def handle_hass_stop(event: Event) -> None:
|
|
"""Cancel this connection."""
|
|
self._cancel()
|
|
|
|
unsub_stop = self.hass.bus.async_listen(
|
|
EVENT_HOMEASSISTANT_STOP, handle_hass_stop
|
|
)
|
|
|
|
# As the webserver is now started before the start
|
|
# event we do not want to block for websocket responses
|
|
self._writer_task = asyncio.create_task(self._writer())
|
|
|
|
auth = AuthPhase(
|
|
self._logger, self.hass, self._send_message, self._cancel, request
|
|
)
|
|
connection = None
|
|
disconnect_warn = None
|
|
|
|
try:
|
|
self._send_message(auth_required_message())
|
|
|
|
# Auth Phase
|
|
try:
|
|
async with async_timeout.timeout(10):
|
|
msg = await wsock.receive()
|
|
except asyncio.TimeoutError as err:
|
|
disconnect_warn = "Did not receive auth message within 10 seconds"
|
|
raise Disconnect from err
|
|
|
|
if msg.type in (WSMsgType.CLOSE, WSMsgType.CLOSED, WSMsgType.CLOSING):
|
|
raise Disconnect
|
|
|
|
if msg.type != WSMsgType.TEXT:
|
|
disconnect_warn = "Received non-Text message."
|
|
raise Disconnect
|
|
|
|
try:
|
|
msg_data = msg.json(loads=json_loads)
|
|
except ValueError as err:
|
|
disconnect_warn = "Received invalid JSON."
|
|
raise Disconnect from err
|
|
|
|
self._logger.debug("Received %s", msg_data)
|
|
self.connection = connection = await auth.async_handle(msg_data)
|
|
self.hass.data[DATA_CONNECTIONS] = (
|
|
self.hass.data.get(DATA_CONNECTIONS, 0) + 1
|
|
)
|
|
async_dispatcher_send(self.hass, SIGNAL_WEBSOCKET_CONNECTED)
|
|
|
|
#
|
|
#
|
|
# Our websocket implementation is backed by an asyncio.Queue
|
|
#
|
|
# As back-pressure builds, the queue will back up and use more memory
|
|
# until we disconnect the client when the queue size reaches
|
|
# MAX_PENDING_MSG. When we are generating a high volume of websocket messages,
|
|
# we hit a bottleneck in aiohttp where it will wait for
|
|
# the buffer to drain before sending the next message and messages
|
|
# start backing up in the queue.
|
|
#
|
|
# https://github.com/aio-libs/aiohttp/issues/1367 added drains
|
|
# to the websocket writer to handle malicious clients and network issues.
|
|
# The drain causes multiple problems for us since the buffer cannot be
|
|
# drained fast enough when we deliver a high volume or large messages:
|
|
#
|
|
# - We end up disconnecting the client. The client will then reconnect,
|
|
# and the cycle repeats itself, which results in a significant amount of
|
|
# CPU usage.
|
|
#
|
|
# - Messages latency increases because messages cannot be moved into
|
|
# the TCP buffer because it is blocked waiting for the drain to happen because
|
|
# of the low default limit of 16KiB. By increasing the limit, we instead
|
|
# rely on the underlying TCP buffer and stack to deliver the messages which
|
|
# can typically happen much faster.
|
|
#
|
|
# After the auth phase is completed, and we are not concerned about
|
|
# the user being a malicious client, we set the limit to force a drain
|
|
# to 1MiB. 1MiB is the maximum expected size of the serialized entity
|
|
# registry, which is the largest message we usually send.
|
|
#
|
|
# https://github.com/aio-libs/aiohttp/commit/b3c80ee3f7d5d8f0b8bc27afe52e4d46621eaf99
|
|
# added a way to set the limit, but there is no way to actually
|
|
# reach the code to set the limit, so we have to set it directly.
|
|
#
|
|
wsock._writer._limit = 2**20 # type: ignore[union-attr] # pylint: disable=protected-access
|
|
|
|
# Command phase
|
|
while not wsock.closed:
|
|
msg = await wsock.receive()
|
|
|
|
if msg.type in (WSMsgType.CLOSE, WSMsgType.CLOSED, WSMsgType.CLOSING):
|
|
break
|
|
|
|
if msg.type == WSMsgType.BINARY:
|
|
if len(msg.data) < 1:
|
|
disconnect_warn = "Received invalid binary message."
|
|
break
|
|
handler = msg.data[0]
|
|
payload = msg.data[1:]
|
|
connection.async_handle_binary(handler, payload)
|
|
continue
|
|
|
|
if msg.type != WSMsgType.TEXT:
|
|
disconnect_warn = "Received non-Text message."
|
|
break
|
|
|
|
try:
|
|
msg_data = msg.json(loads=json_loads)
|
|
except ValueError:
|
|
disconnect_warn = "Received invalid JSON."
|
|
break
|
|
|
|
self._logger.debug("Received %s", msg_data)
|
|
if not isinstance(msg_data, list):
|
|
connection.async_handle(msg_data)
|
|
continue
|
|
|
|
for split_msg in msg_data:
|
|
connection.async_handle(split_msg)
|
|
|
|
except asyncio.CancelledError:
|
|
self._logger.info("Connection closed by client")
|
|
|
|
except Disconnect:
|
|
pass
|
|
|
|
except Exception: # pylint: disable=broad-except
|
|
self._logger.exception("Unexpected error inside websocket API")
|
|
|
|
finally:
|
|
unsub_stop()
|
|
|
|
if connection is not None:
|
|
connection.async_handle_close()
|
|
|
|
self._closing = True
|
|
|
|
self._message_queue.append(None)
|
|
if self._ready_future and not self._ready_future.done():
|
|
self._ready_future.set_result(None)
|
|
|
|
try:
|
|
# Make sure all error messages are written before closing
|
|
await self._writer_task
|
|
await wsock.close()
|
|
finally:
|
|
if disconnect_warn is None:
|
|
self._logger.debug("Disconnected")
|
|
else:
|
|
self._logger.warning("Disconnected: %s", disconnect_warn)
|
|
|
|
if connection is not None:
|
|
self.hass.data[DATA_CONNECTIONS] -= 1
|
|
self.connection = None
|
|
|
|
async_dispatcher_send(self.hass, SIGNAL_WEBSOCKET_DISCONNECTED)
|
|
|
|
return wsock
|