Dynamically size recorder max backlog based on available memory (#90894)

Co-authored-by: Franck Nijhof <git@frenck.dev>
Authored by J. Nick Koston on 2023-04-18 00:35:49 -10:00; committed by GitHub
parent f49dc65ff2
commit 2ec1359063
9 changed files with 151 additions and 29 deletions
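Instead of stopping the recorder at a fixed MAX_QUEUE_BACKLOG, the queue ceiling is now derived from the memory available on the host, with MAX_QUEUE_BACKLOG_MIN_VALUE as a floor so small systems still get a usable queue. A minimal sketch of that sizing rule, using plain psutil and illustrative constant values (the real constants live in the recorder's const.py and may differ):

import psutil

# Illustrative values only; the actual constants are defined in recorder/const.py.
ESTIMATED_QUEUE_ITEM_SIZE = 10_240  # assumed average size of one queued event, in bytes
QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY = 0.65  # assumed share of free RAM the queue may consume
MAX_QUEUE_BACKLOG_MIN_VALUE = 65_000  # assumed floor for the queue ceiling


def dynamic_max_backlog() -> int:
    """Return a queue ceiling derived from the memory currently available."""
    available = psutil.virtual_memory().available  # bytes of RAM free right now
    max_backlog = int(
        QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
        * (available / ESTIMATED_QUEUE_ITEM_SIZE)
    )
    # Never report less than the floor, even on memory-starved hosts.
    return max(max_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE)

With the illustrative values above, a host with 8 GB of RAM free allows roughly 0.65 * 8e9 / 10240 ≈ 500,000 queued events, whereas the previous behaviour cut off at one fixed number regardless of memory.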

homeassistant/components/recorder/core.py

@@ -11,9 +11,10 @@ import queue
 import sqlite3
 import threading
 import time
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast

 import async_timeout
+import psutil_home_assistant as ha_psutil
 from sqlalchemy import create_engine, event as sqlalchemy_event, exc, select
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.interfaces import DBAPIConnection
@@ -45,14 +46,16 @@ from .const import (
     CONTEXT_ID_AS_BINARY_SCHEMA_VERSION,
     DB_WORKER_PREFIX,
     DOMAIN,
+    ESTIMATED_QUEUE_ITEM_SIZE,
     EVENT_TYPE_IDS_SCHEMA_VERSION,
     KEEPALIVE_TIME,
     LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION,
     MARIADB_PYMYSQL_URL_PREFIX,
     MARIADB_URL_PREFIX,
-    MAX_QUEUE_BACKLOG,
+    MAX_QUEUE_BACKLOG_MIN_VALUE,
     MYSQLDB_PYMYSQL_URL_PREFIX,
     MYSQLDB_URL_PREFIX,
+    QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY,
     SQLITE_URL_PREFIX,
     STATES_META_SCHEMA_VERSION,
     STATISTICS_ROWS_SCHEMA_VERSION,
@@ -148,7 +151,7 @@ WAIT_TASK = WaitTask()
 ADJUST_LRU_SIZE_TASK = AdjustLRUSizeTask()
 DB_LOCK_TIMEOUT = 30
-DB_LOCK_QUEUE_CHECK_TIMEOUT = 1
+DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # check every 10 seconds
 INVALIDATED_ERR = "Database connection invalidated"
@@ -201,6 +204,8 @@ class Recorder(threading.Thread):
         self.async_recorder_ready = asyncio.Event()
         self._queue_watch = threading.Event()
         self.engine: Engine | None = None
+        self.max_backlog: int = MAX_QUEUE_BACKLOG_MIN_VALUE
+        self._psutil: ha_psutil.PsutilWrapper | None = None

         # The entity_filter is exposed on the recorder instance so that
         # it can be used to see if an entity is being recorded and is called
@@ -343,7 +348,7 @@
         """
         size = self.backlog
         _LOGGER.debug("Recorder queue size is: %s", size)
-        if size <= MAX_QUEUE_BACKLOG:
+        if not self._reached_max_backlog_percentage(100):
             return
         _LOGGER.error(
             (
@@ -352,10 +357,33 @@
                 "is corrupt due to a disk problem; The recorder will stop "
                 "recording events to avoid running out of memory"
             ),
-            MAX_QUEUE_BACKLOG,
+            self.backlog,
         )
         self._async_stop_queue_watcher_and_event_listener()

+    def _available_memory(self) -> int:
+        """Return the available memory in bytes."""
+        if not self._psutil:
+            self._psutil = ha_psutil.PsutilWrapper()
+        return cast(int, self._psutil.psutil.virtual_memory().available)
+
+    def _reached_max_backlog_percentage(self, percentage: int) -> bool:
+        """Check if the system has reached the max queue backlog and return True if it has."""
+        percentage_modifier = percentage / 100
+        current_backlog = self.backlog
+        # First check the minimum value since it is cheap
+        if current_backlog < (MAX_QUEUE_BACKLOG_MIN_VALUE * percentage_modifier):
+            return False
+        # If they have more RAM available, keep filling the backlog
+        # since we do not want to stop recording events or give the
+        # user a bad backup when they have plenty of RAM available.
+        max_queue_backlog = int(
+            QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
+            * (self._available_memory() / ESTIMATED_QUEUE_ITEM_SIZE)
+        )
+        self.max_backlog = max(max_queue_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE)
+        return current_backlog >= (max_queue_backlog * percentage_modifier)
+
     @callback
     def _async_stop_queue_watcher_and_event_listener(self) -> None:
         """Stop watching the queue and listening for events."""
@@ -705,8 +733,8 @@
                 self.schema_version = SCHEMA_VERSION
                 if not self._event_listener:
                     # If the schema migration takes so long that the end
-                    # queue watcher safety kicks in because MAX_QUEUE_BACKLOG
-                    # is reached, we need to reinitialize the listener.
+                    # queue watcher safety kicks in because _reached_max_backlog
+                    # was True, we need to reinitialize the listener.
                     self.hass.add_job(self.async_initialize)
             else:
                 persistent_notification.create(
@@ -946,12 +974,14 @@
             # Notify that lock is being held, wait until database can be used again.
             self.hass.add_job(_async_set_database_locked, task)
             while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
-                if self.backlog > MAX_QUEUE_BACKLOG * 0.9:
+                if self._reached_max_backlog_percentage(90):
                     _LOGGER.warning(
-                        "Database queue backlog reached more than 90% of maximum queue "
+                        "Database queue backlog reached more than %s (%s events) of maximum queue "
                         "length while waiting for backup to finish; recorder will now "
                         "resume writing to database. The backup cannot be trusted and "
-                        "must be restarted"
+                        "must be restarted",
+                        "90%",
+                        self.backlog,
                     )
                     task.queue_overflow = True
                     break
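The hunk above is a small watchdog around the backup lock: wake every DB_LOCK_QUEUE_CHECK_TIMEOUT seconds and abandon the backup once the backlog crosses 90% of the dynamic ceiling, so there is still headroom to drain the queue. A stripped-down sketch of that loop with placeholder names rather than the recorder's real task API:

import threading
from collections.abc import Callable

DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # seconds between backlog checks while the DB is locked


def hold_lock_for_backup(
    database_unlock: threading.Event,
    reached_90_percent: Callable[[], bool],
) -> bool:
    """Block until the backup releases the lock; return True if the queue overflowed."""
    # Event.wait(timeout=...) returns False on timeout, so each False wake-up
    # is a chance to re-check the backlog before going back to sleep.
    while not database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
        if reached_90_percent():
            # Give up on the backup rather than risk running out of memory.
            return True
    return False

Combined with the earlier change of DB_LOCK_QUEUE_CHECK_TIMEOUT from 1 to 10 seconds, the loop now polls less often while the overflow threshold itself adapts to available memory.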