Dynamically size recorder max backlog based on available memory (#90894)
Co-authored-by: Franck Nijhof <git@frenck.dev>
parent f49dc65ff2
commit 2ec1359063
9 changed files with 151 additions and 29 deletions
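The hunks excerpted below are from the recorder's core module (one of the nine changed files). The central idea: instead of stopping the recorder once a fixed MAX_QUEUE_BACKLOG is exceeded, the cap is now derived from the memory currently available on the host, with a fixed value kept only as a lower bound (MAX_QUEUE_BACKLOG_MIN_VALUE). A minimal standalone sketch of that sizing rule, using plain psutil and made-up constant values (the function name and all numbers here are illustrative assumptions; the real constants live in the recorder's const module and are not shown in this excerpt):

```python
import psutil  # stand-in for the psutil-home-assistant wrapper used by the recorder

# Made-up values for illustration only; the real numbers are defined in the
# recorder's const module (MAX_QUEUE_BACKLOG_MIN_VALUE, ESTIMATED_QUEUE_ITEM_SIZE,
# QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY) and are not part of this excerpt.
MIN_BACKLOG = 65_000            # floor: never allow fewer queued events than this
ALLOWED_MEMORY_FRACTION = 0.10  # fraction of available RAM the queue may consume
ESTIMATED_ITEM_SIZE = 10_240    # rough bytes per queued event


def dynamic_max_backlog() -> int:
    """Derive the queue cap from available memory, never going below the floor."""
    available = psutil.virtual_memory().available  # bytes currently available
    return max(int(ALLOWED_MEMORY_FRACTION * available / ESTIMATED_ITEM_SIZE), MIN_BACKLOG)


print(f"Queue cap on this host: {dynamic_max_backlog():,} events")
```

Because the calculation is repeated on every check, the cap tracks memory pressure at runtime rather than being fixed at import time.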
```diff
@@ -11,9 +11,10 @@ import queue
 import sqlite3
 import threading
 import time
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast
 
 import async_timeout
+import psutil_home_assistant as ha_psutil
 from sqlalchemy import create_engine, event as sqlalchemy_event, exc, select
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.interfaces import DBAPIConnection
```
```diff
@@ -45,14 +46,16 @@ from .const import (
     CONTEXT_ID_AS_BINARY_SCHEMA_VERSION,
     DB_WORKER_PREFIX,
     DOMAIN,
+    ESTIMATED_QUEUE_ITEM_SIZE,
     EVENT_TYPE_IDS_SCHEMA_VERSION,
     KEEPALIVE_TIME,
     LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION,
     MARIADB_PYMYSQL_URL_PREFIX,
     MARIADB_URL_PREFIX,
-    MAX_QUEUE_BACKLOG,
+    MAX_QUEUE_BACKLOG_MIN_VALUE,
     MYSQLDB_PYMYSQL_URL_PREFIX,
     MYSQLDB_URL_PREFIX,
+    QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY,
     SQLITE_URL_PREFIX,
     STATES_META_SCHEMA_VERSION,
     STATISTICS_ROWS_SCHEMA_VERSION,
```
```diff
@@ -148,7 +151,7 @@ WAIT_TASK = WaitTask()
 ADJUST_LRU_SIZE_TASK = AdjustLRUSizeTask()
 
 DB_LOCK_TIMEOUT = 30
-DB_LOCK_QUEUE_CHECK_TIMEOUT = 1
+DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # check every 10 seconds
 
 
 INVALIDATED_ERR = "Database connection invalidated"
```
```diff
@@ -201,6 +204,8 @@ class Recorder(threading.Thread):
         self.async_recorder_ready = asyncio.Event()
         self._queue_watch = threading.Event()
         self.engine: Engine | None = None
+        self.max_backlog: int = MAX_QUEUE_BACKLOG_MIN_VALUE
+        self._psutil: ha_psutil.PsutilWrapper | None = None
 
         # The entity_filter is exposed on the recorder instance so that
         # it can be used to see if an entity is being recorded and is called
```
```diff
@@ -343,7 +348,7 @@ class Recorder(threading.Thread):
         """
         size = self.backlog
         _LOGGER.debug("Recorder queue size is: %s", size)
-        if size <= MAX_QUEUE_BACKLOG:
+        if not self._reached_max_backlog_percentage(100):
             return
         _LOGGER.error(
             (
```
```diff
@@ -352,10 +357,33 @@ class Recorder(threading.Thread):
                 "is corrupt due to a disk problem; The recorder will stop "
                 "recording events to avoid running out of memory"
             ),
-            MAX_QUEUE_BACKLOG,
+            self.backlog,
         )
         self._async_stop_queue_watcher_and_event_listener()
 
+    def _available_memory(self) -> int:
+        """Return the available memory in bytes."""
+        if not self._psutil:
+            self._psutil = ha_psutil.PsutilWrapper()
+        return cast(int, self._psutil.psutil.virtual_memory().available)
+
+    def _reached_max_backlog_percentage(self, percentage: int) -> bool:
+        """Check if the system has reached the max queue backlog and return the maximum if it has."""
+        percentage_modifier = percentage / 100
+        current_backlog = self.backlog
+        # First check the minimum value since its cheap
+        if current_backlog < (MAX_QUEUE_BACKLOG_MIN_VALUE * percentage_modifier):
+            return False
+        # If they have more RAM available, keep filling the backlog
+        # since we do not want to stop recording events or give the
+        # user a bad backup when they have plenty of RAM available.
+        max_queue_backlog = int(
+            QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
+            * (self._available_memory() / ESTIMATED_QUEUE_ITEM_SIZE)
+        )
+        self.max_backlog = max(max_queue_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE)
+        return current_backlog >= (max_queue_backlog * percentage_modifier)
+
     @callback
     def _async_stop_queue_watcher_and_event_listener(self) -> None:
         """Stop watching the queue and listening for events."""
```
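To make the numbers concrete, here is a rough worked example of the calculation in `_reached_max_backlog_percentage`. All constant values below are assumed for illustration; the real ones come from the recorder's const module and are not part of this hunk.

```python
# Assumed, illustrative constants (not the real const-module values):
QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY = 0.10  # use up to 10% of available RAM
ESTIMATED_QUEUE_ITEM_SIZE = 10_240                # ~10 KiB per queued event
MAX_QUEUE_BACKLOG_MIN_VALUE = 65_000

available_memory = 4 * 1024**3  # pretend psutil reports 4 GiB available

max_queue_backlog = int(
    QUEUE_PERCENTAGE_ALLOWED_AVAILABLE_MEMORY
    * (available_memory / ESTIMATED_QUEUE_ITEM_SIZE)
)
print(max_queue_backlog)  # 41943 under these assumptions

# That is below the floor, so the advertised cap stays at the minimum:
print(max(max_queue_backlog, MAX_QUEUE_BACKLOG_MIN_VALUE))  # 65000

# With 32 GiB available instead, the same formula yields roughly 335,544 events,
# so a memory-rich host can absorb a much longer migration or backup.
```

The 100 percent threshold drives the hard stop in the queue watcher above, while the 90 percent threshold is used while the database is locked for a backup (see the final hunk).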
```diff
@@ -705,8 +733,8 @@ class Recorder(threading.Thread):
                 self.schema_version = SCHEMA_VERSION
                 if not self._event_listener:
                     # If the schema migration takes so long that the end
-                    # queue watcher safety kicks in because MAX_QUEUE_BACKLOG
-                    # is reached, we need to reinitialize the listener.
+                    # queue watcher safety kicks in because _reached_max_backlog
+                    # was True, we need to reinitialize the listener.
                     self.hass.add_job(self.async_initialize)
             else:
                 persistent_notification.create(
```
```diff
@@ -946,12 +974,14 @@ class Recorder(threading.Thread):
             # Notify that lock is being held, wait until database can be used again.
             self.hass.add_job(_async_set_database_locked, task)
             while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
-                if self.backlog > MAX_QUEUE_BACKLOG * 0.9:
+                if self._reached_max_backlog_percentage(90):
                     _LOGGER.warning(
-                        "Database queue backlog reached more than 90% of maximum queue "
+                        "Database queue backlog reached more than %s (%s events) of maximum queue "
                         "length while waiting for backup to finish; recorder will now "
                         "resume writing to database. The backup cannot be trusted and "
-                        "must be restarted"
+                        "must be restarted",
+                        "90%",
+                        self.backlog,
                     )
                     task.queue_overflow = True
                     break
```
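The last hunk also changes behaviour during backups: the lock-holding loop now wakes every DB_LOCK_QUEUE_CHECK_TIMEOUT (10 s instead of 1 s) and abandons the backup once the queue passes 90 percent of the dynamically computed cap. A simplified, standalone sketch of that wait-and-check pattern (function and parameter names here are hypothetical, not the recorder's API):

```python
import threading
from collections.abc import Callable

DB_LOCK_QUEUE_CHECK_TIMEOUT = 10  # seconds between backlog checks while the DB is locked


def hold_database_lock(
    database_unlock: threading.Event,
    reached_max_backlog_percentage: Callable[[int], bool],
) -> bool:
    """Block until the lock is released; return False if the queue overflowed first.

    Hypothetical standalone analogue of the loop in Recorder._lock_database.
    """
    while not database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
        if reached_max_backlog_percentage(90):
            # Mirrors task.queue_overflow = True followed by break: resume writing
            # and signal that the backup can no longer be trusted.
            return False
    return True
```

Because the 90 percent threshold now tracks available memory rather than a fixed constant, hosts with plenty of RAM can ride out longer backups before the recorder gives up on the lock.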