Dynamically size recorder max backlog based on available memory (#90894)

Co-authored-by: Franck Nijhof <git@frenck.dev>
Authored by J. Nick Koston on 2023-04-18 00:35:49 -10:00; committed by GitHub
parent f49dc65ff2
commit 2ec1359063
9 changed files with 151 additions and 29 deletions
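Instead of stopping the recorder at a fixed MAX_QUEUE_BACKLOG, the queue ceiling is now derived from the memory available on the host, with MAX_QUEUE_BACKLOG_MIN_VALUE as a floor so small systems still get a usable queue. A minimal sketch of that sizing rule, using plain psutil and illustrative constant values (the real constants live in the recorder's const.py and may differ):

import psutil

# Illustrative values only; the actual constants are defined in recorder/const.py.
ESTIMATED_QUEUE_ITEM_SIZE = 10_240  # assumed average size of one queued event, in bytes
QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY = 0.65  # assumed share of free RAM the queue may consume
MAX_QUEUE_BACKLOG_MIN_VALUE = 65_000  # assumed floor for the queue ceiling


def dynamic_max_backlog() -> int:
    """Return a queue ceiling derived from the memory currently available."""
    available = psutil.virtual_memory().available  # bytes of RAM free right now
    max_backlog = int(
        QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
        * (available / ESTIMATED_QUEUE_ITEM_SIZE)
    )
    # Never report less than the floor, even on memory-starved hosts.
    return max(max_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE)

With the illustrative values above, a host with 8 GB of RAM free allows roughly 0.65 * 8e9 / 10240 ≈ 500,000 queued events, whereas the previous behaviour cut off at one fixed number regardless of memory.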

homeassistant/components/recorder/core.py

@@ -11,9 +11,10 @@ import queue
 import sqlite3
 import threading
 import time
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast

 import async_timeout
+import psutil_home_assistant as ha_psutil
 from sqlalchemy import create_engine, event as sqlalchemy_event, exc, select
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.interfaces import DBAPIConnection
@@ -45,14 +46,16 @@ from .const import (
     CONTEXT_ID_AS_BINARY_SCHEMA_VERSION,
     DB_WORKER_PREFIX,
     DOMAIN,
+    ESTIMATED_QUEUE_ITEM_SIZE,
     EVENT_TYPE_IDS_SCHEMA_VERSION,
     KEEPALIVE_TIME,
     LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION,
     MARIADB_PYMYSQL_URL_PREFIX,
     MARIADB_URL_PREFIX,
-    MAX_QUEUE_BACKLOG,
+    MAX_QUEUE_BACKLOG_MIN_VALUE,
     MYSQLDB_PYMYSQL_URL_PREFIX,
     MYSQLDB_URL_PREFIX,
+    QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY,
     SQLITE_URL_PREFIX,
     STATES_META_SCHEMA_VERSION,
     STATISTICS_ROWS_SCHEMA_VERSION,
@@ -148,7 +151,7 @@ WAIT_TASK = WaitTask()
 ADJUST_LRU_SIZE_TASK = AdjustLRUSizeTask()
 DB_LOCK_TIMEOUT = 30
-DB_LOCK_QUEUE_CHECK_TIMEOUT = 1
+DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # check every 10 seconds
 INVALIDATED_ERR = "Database connection invalidated"
@@ -201,6 +204,8 @@ class Recorder(threading.Thread):
         self.async_recorder_ready = asyncio.Event()
         self._queue_watch = threading.Event()
         self.engine: Engine | None = None
+        self.max_backlog: int = MAX_QUEUE_BACKLOG_MIN_VALUE
+        self._psutil: ha_psutil.PsutilWrapper | None = None

         # The entity_filter is exposed on the recorder instance so that
         # it can be used to see if an entity is being recorded and is called
@@ -343,7 +348,7 @@
         """
         size = self.backlog
         _LOGGER.debug("Recorder queue size is: %s", size)
-        if size <= MAX_QUEUE_BACKLOG:
+        if not self._reached_max_backlog_percentage(100):
             return
         _LOGGER.error(
             (
@@ -352,10 +357,33 @@
                 "is corrupt due to a disk problem; The recorder will stop "
                 "recording events to avoid running out of memory"
             ),
-            MAX_QUEUE_BACKLOG,
+            self.backlog,
         )
         self._async_stop_queue_watcher_and_event_listener()

+    def _available_memory(self) -> int:
+        """Return the available memory in bytes."""
+        if not self._psutil:
+            self._psutil = ha_psutil.PsutilWrapper()
+        return cast(int, self._psutil.psutil.virtual_memory().available)
+
+    def _reached_max_backlog_percentage(self, percentage: int) -> bool:
+        """Check if the system has reached the max queue backlog and return True if it has."""
+        percentage_modifier = percentage / 100
+        current_backlog = self.backlog
+        # First check the minimum value since it is cheap
+        if current_backlog < (MAX_QUEUE_BACKLOG_MIN_VALUE * percentage_modifier):
+            return False
+        # If they have more RAM available, keep filling the backlog
+        # since we do not want to stop recording events or give the
+        # user a bad backup when they have plenty of RAM available.
+        max_queue_backlog = int(
+            QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
+            * (self._available_memory() / ESTIMATED_QUEUE_ITEM_SIZE)
+        )
+        self.max_backlog = max(max_queue_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE)
+        return current_backlog >= (max_queue_backlog * percentage_modifier)
+
     @callback
     def _async_stop_queue_watcher_and_event_listener(self) -> None:
         """Stop watching the queue and listening for events."""
@@ -705,8 +733,8 @@
                 self.schema_version = SCHEMA_VERSION
                 if not self._event_listener:
                     # If the schema migration takes so long that the end
-                    # queue watcher safety kicks in because MAX_QUEUE_BACKLOG
-                    # is reached, we need to reinitialize the listener.
+                    # queue watcher safety kicks in because _reached_max_backlog
+                    # was True, we need to reinitialize the listener.
                     self.hass.add_job(self.async_initialize)
             else:
                 persistent_notification.create(
@@ -946,12 +974,14 @@
             # Notify that lock is being held, wait until database can be used again.
             self.hass.add_job(_async_set_database_locked, task)
             while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
-                if self.backlog > MAX_QUEUE_BACKLOG * 0.9:
+                if self._reached_max_backlog_percentage(90):
                     _LOGGER.warning(
-                        "Database queue backlog reached more than 90% of maximum queue "
+                        "Database queue backlog reached more than %s (%s events) of maximum queue "
                         "length while waiting for backup to finish; recorder will now "
                         "resume writing to database. The backup cannot be trusted and "
-                        "must be restarted"
+                        "must be restarted",
+                        "90%",
+                        self.backlog,
                     )
                     task.queue_overflow = True
                     break
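The hunk above is a small watchdog around the backup lock: wake every DB_LOCK_QUEUE_CHECK_TIMEOUT seconds and abandon the backup once the backlog crosses 90% of the dynamic ceiling, so there is still headroom to drain the queue. A stripped-down sketch of that loop with placeholder names rather than the recorder's real task API:

import threading
from collections.abc import Callable

DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # seconds between backlog checks while the DB is locked


def hold_lock_for_backup(
    database_unlock: threading.Event,
    reached_90_percent: Callable[[], bool],
) -> bool:
    """Block until the backup releases the lock; return True if the queue overflowed."""
    # Event.wait(timeout=...) returns False on timeout, so each False wake-up
    # is a chance to re-check the backlog before going back to sleep.
    while not database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
        if reached_90_percent():
            # Give up on the backup rather than risk running out of memory.
            return True
    return False

Combined with the earlier change of DB_LOCK_QUEUE_CHECK_TIMEOUT from 1 to 10 seconds, the loop now polls less often while the overflow threshold itself adapts to available memory.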