* Fix CPU thrashing during purge after all legacy events were removed. We now remove the index of event ids on the states table when it is all NULLs to save space. The purge path needs to avoid checking for legacy rows to purge if the index has been removed, since that check would result in a full table scan on each purge cycle that always finds no legacy rows to purge * one more place * drop the key constraint as well * fixes * more sqlite
"""Purge old data helper."""
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Callable
|
|
from datetime import datetime
|
|
from itertools import zip_longest
|
|
import logging
|
|
import time
|
|
from typing import TYPE_CHECKING
|
|
|
|
from sqlalchemy.orm.session import Session
|
|
|
|
import homeassistant.util.dt as dt_util
|
|
|
|
from .const import SQLITE_MAX_BIND_VARS
|
|
from .db_schema import Events, States, StatesMeta
|
|
from .models import DatabaseEngine
|
|
from .queries import (
|
|
attributes_ids_exist_in_states,
|
|
attributes_ids_exist_in_states_with_fast_in_distinct,
|
|
data_ids_exist_in_events,
|
|
data_ids_exist_in_events_with_fast_in_distinct,
|
|
delete_event_data_rows,
|
|
delete_event_rows,
|
|
delete_event_types_rows,
|
|
delete_recorder_runs_rows,
|
|
delete_states_attributes_rows,
|
|
delete_states_meta_rows,
|
|
delete_states_rows,
|
|
delete_statistics_runs_rows,
|
|
delete_statistics_short_term_rows,
|
|
disconnect_states_rows,
|
|
find_entity_ids_to_purge,
|
|
find_event_types_to_purge,
|
|
find_events_to_purge,
|
|
find_latest_statistics_runs_run_id,
|
|
find_legacy_event_state_and_attributes_and_data_ids_to_purge,
|
|
find_legacy_row,
|
|
find_short_term_statistics_to_purge,
|
|
find_states_to_purge,
|
|
find_statistics_runs_to_purge,
|
|
)
|
|
from .repack import repack_database
|
|
from .util import chunked, retryable_database_job, session_scope
|
|
|
|
if TYPE_CHECKING:
|
|
from . import Recorder
|
|
|
|
_LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
DEFAULT_STATES_BATCHES_PER_PURGE = 20  # We expect ~95% de-dupe rate
DEFAULT_EVENTS_BATCHES_PER_PURGE = 15  # We expect ~92% de-dupe rate
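
# A rough upper bound per purge cycle (a sketch, assuming SQLITE_MAX_BIND_VARS
# from .const is near SQLite's default bind-variable limit of ~1000): each
# batch selects at most SQLITE_MAX_BIND_VARS rows, so one cycle processes
# roughly 20 * SQLITE_MAX_BIND_VARS states and 15 * SQLITE_MAX_BIND_VARS
# events before returning.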


@retryable_database_job("purge")
def purge_old_data(
    instance: Recorder,
    purge_before: datetime,
    repack: bool,
    apply_filter: bool = False,
    events_batch_size: int = DEFAULT_EVENTS_BATCHES_PER_PURGE,
    states_batch_size: int = DEFAULT_STATES_BATCHES_PER_PURGE,
) -> bool:
    """Purge events and states older than purge_before.

    Cleans up a timeframe of an hour, based on the oldest record.
    """
    _LOGGER.debug(
        "Purging states and events before target %s",
        purge_before.isoformat(sep=" ", timespec="seconds"),
    )
    with session_scope(session=instance.get_session()) as session:
        # Purge a max of SQLITE_MAX_BIND_VARS, based on the oldest states or events record
        has_more_to_purge = False
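        # If the legacy event_id index has been removed from the states
        # table, checking for legacy rows would degrade into a full table
        # scan on every purge cycle that always finds nothing to purge
        # (see the commit message above), so the check is gated on
        # instance.use_legacy_events_index.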
        if instance.use_legacy_events_index and _purging_legacy_format(session):
            _LOGGER.debug(
                "Purge running in legacy format as there are states with event_id"
                " remaining"
            )
            has_more_to_purge |= _purge_legacy_format(instance, session, purge_before)
        else:
            _LOGGER.debug(
                "Purge running in new format as there are NO states with event_id"
                " remaining"
            )
            # Once we are done purging legacy rows, we use the new method
            has_more_to_purge |= _purge_states_and_attributes_ids(
                instance, session, states_batch_size, purge_before
            )
            has_more_to_purge |= _purge_events_and_data_ids(
                instance, session, events_batch_size, purge_before
            )

        statistics_runs = _select_statistics_runs_to_purge(session, purge_before)
        short_term_statistics = _select_short_term_statistics_to_purge(
            session, purge_before
        )
        if statistics_runs:
            _purge_statistics_runs(session, statistics_runs)

        if short_term_statistics:
            _purge_short_term_statistics(session, short_term_statistics)

        if has_more_to_purge or statistics_runs or short_term_statistics:
            # Return false, as we might not be done yet.
            _LOGGER.debug("Purging hasn't fully completed yet")
            return False

        if apply_filter and _purge_filtered_data(instance, session) is False:
            _LOGGER.debug("Cleanup filtered data hasn't fully completed yet")
            return False

        # This purge cycle is finished, clean up old event types and
        # recorder runs
        if instance.event_type_manager.active:
            _purge_old_event_types(instance, session)

        if instance.states_meta_manager.active:
            _purge_old_entity_ids(instance, session)

        _purge_old_recorder_runs(instance, session, purge_before)
    if repack:
        repack_database(instance)
    return True

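
# This probe is the query that degrades into a full table scan once the
# legacy event_id index is gone, which is why purge_old_data only calls
# it while instance.use_legacy_events_index is still set.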
def _purging_legacy_format(session: Session) -> bool:
    """Check if there are any legacy event_id linked states rows remaining."""
    return bool(session.execute(find_legacy_row()).scalar())


def _purge_legacy_format(
    instance: Recorder, session: Session, purge_before: datetime
) -> bool:
    """Purge rows that are still linked by the event_ids."""
    (
        event_ids,
        state_ids,
        attributes_ids,
        data_ids,
    ) = _select_legacy_event_state_and_attributes_and_data_ids_to_purge(
        session, purge_before
    )
    _purge_state_ids(instance, session, state_ids)
    _purge_unused_attributes_ids(instance, session, attributes_ids)
    _purge_event_ids(session, event_ids)
    _purge_unused_data_ids(instance, session, data_ids)
    return bool(event_ids or state_ids or attributes_ids or data_ids)


def _purge_states_and_attributes_ids(
    instance: Recorder,
    session: Session,
    states_batch_size: int,
    purge_before: datetime,
) -> bool:
    """Purge states and linked attributes ids in a batch.

    Returns true if there are more states to purge.
    """
    database_engine = instance.database_engine
    assert database_engine is not None
    has_remaining_state_ids_to_purge = True
    # There are more states relative to attributes_ids so
    # we purge enough state_ids to try to generate a full
    # size batch of attributes_ids that will be around the size
    # of SQLITE_MAX_BIND_VARS
    attributes_ids_batch: set[int] = set()
    for _ in range(states_batch_size):
        state_ids, attributes_ids = _select_state_attributes_ids_to_purge(
            session, purge_before
        )
        if not state_ids:
            has_remaining_state_ids_to_purge = False
            break
        _purge_state_ids(instance, session, state_ids)
        attributes_ids_batch = attributes_ids_batch | attributes_ids

    _purge_unused_attributes_ids(instance, session, attributes_ids_batch)
    _LOGGER.debug(
        "After purging states and attributes_ids remaining=%s",
        has_remaining_state_ids_to_purge,
    )
    return has_remaining_state_ids_to_purge


def _purge_events_and_data_ids(
    instance: Recorder,
    session: Session,
    events_batch_size: int,
    purge_before: datetime,
) -> bool:
    """Purge events and linked data ids in a batch.

    Returns true if there are more events to purge.
    """
    has_remaining_event_ids_to_purge = True
    # There are more events relative to data_ids so
    # we purge enough event_ids to try to generate a full
    # size batch of data_ids that will be around the size
    # of SQLITE_MAX_BIND_VARS
    data_ids_batch: set[int] = set()
    for _ in range(events_batch_size):
        event_ids, data_ids = _select_event_data_ids_to_purge(session, purge_before)
        if not event_ids:
            has_remaining_event_ids_to_purge = False
            break
        _purge_event_ids(session, event_ids)
        data_ids_batch = data_ids_batch | data_ids

    _purge_unused_data_ids(instance, session, data_ids_batch)
    _LOGGER.debug(
        "After purging event and data_ids remaining=%s",
        has_remaining_event_ids_to_purge,
    )
    return has_remaining_event_ids_to_purge


def _select_state_attributes_ids_to_purge(
    session: Session, purge_before: datetime
) -> tuple[set[int], set[int]]:
    """Return sets of state and attribute ids to purge."""
    state_ids = set()
    attributes_ids = set()
    for state in session.execute(
        find_states_to_purge(dt_util.utc_to_timestamp(purge_before))
    ).all():
        state_ids.add(state.state_id)
        if state.attributes_id:
            attributes_ids.add(state.attributes_id)
    _LOGGER.debug(
        "Selected %s state ids and %s attributes_ids to remove",
        len(state_ids),
        len(attributes_ids),
    )
    return state_ids, attributes_ids


def _select_event_data_ids_to_purge(
    session: Session, purge_before: datetime
) -> tuple[set[int], set[int]]:
    """Return sets of event and data ids to purge."""
    event_ids = set()
    data_ids = set()
    for event in session.execute(
        find_events_to_purge(dt_util.utc_to_timestamp(purge_before))
    ).all():
        event_ids.add(event.event_id)
        if event.data_id:
            data_ids.add(event.data_id)
    _LOGGER.debug(
        "Selected %s event ids and %s data_ids to remove", len(event_ids), len(data_ids)
    )
    return event_ids, data_ids


def _select_unused_attributes_ids(
    session: Session, attributes_ids: set[int], database_engine: DatabaseEngine
) -> set[int]:
    """Return a set of attributes ids that are not used by any states in the db."""
    if not attributes_ids:
        return set()

    if not database_engine.optimizer.slow_range_in_select:
        #
        # SQLite has a superior query optimizer for the distinct query below as it uses
        # the covering index without having to examine the rows directly for both of
        # the queries below.
        #
        # We use the distinct query for SQLite since the query in the other branch can
        # generate more than 500 unions, which SQLite does not support.
        #
        # How MariaDB's query optimizer handles this query:
        # > explain select distinct attributes_id from states where attributes_id in
        #   (136723);
        # ...Using index
        #
        seen_ids = {
            state[0]
            for state in session.execute(
                attributes_ids_exist_in_states_with_fast_in_distinct(attributes_ids)
            ).all()
        }
    else:
        #
        # This branch is for a DBMS that cannot optimize the distinct query well and
        # has to examine all the rows that match.
        #
        # This branch uses a union of simple queries, as each query is optimized away
        # because the answer to the query can be found in the index.
        #
        # The below query works for SQLite as long as there are no more than 500
        # attributes_id to be selected. We currently do not have MySQL or PostgreSQL
        # servers running in the test suite; we test this path using SQLite when there
        # are fewer than 500 attributes_id.
        #
        # How MariaDB's query optimizer handles this query:
        # > explain select min(attributes_id) from states where attributes_id = 136723;
        # ...Select tables optimized away
        #
        # We used to generate a query based on how many attribute_ids there were to
        # find, but that meant SQLAlchemy's transparent SQL compilation caching worked
        # against us by caching up to SQLITE_MAX_BIND_VARS different statements, which
        # could take up to 500MB for a large database due to the complexity of the ORM
        # objects.
        #
        # We now break the query into groups of 100 and use a lambda_stmt to ensure
        # that the query is only cached once.
        #
        seen_ids = set()
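        # Note: [iter(attributes_ids)] * 100 repeats the *same* iterator, so
        # zip_longest(*groups) drains it into successive fixed-size groups;
        # e.g. three ids become a single 100-tuple padded with None:
        #   (id1, id2, id3, None, ..., None)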
        groups = [iter(attributes_ids)] * 100
        for attr_ids in zip_longest(*groups, fillvalue=None):
            seen_ids |= {
                attrs_id[0]
                for attrs_id in session.execute(
                    attributes_ids_exist_in_states(*attr_ids)  # type: ignore[arg-type]
                ).all()
                if attrs_id[0] is not None
            }
    to_remove = attributes_ids - seen_ids
    _LOGGER.debug(
        "Selected %s shared attributes to remove",
        len(to_remove),
    )
    return to_remove


def _purge_unused_attributes_ids(
    instance: Recorder,
    session: Session,
    attributes_ids_batch: set[int],
) -> None:
    """Purge unused attributes ids."""
    database_engine = instance.database_engine
    assert database_engine is not None
    if unused_attribute_ids_set := _select_unused_attributes_ids(
        session, attributes_ids_batch, database_engine
    ):
        _purge_batch_attributes_ids(instance, session, unused_attribute_ids_set)


def _select_unused_event_data_ids(
    session: Session, data_ids: set[int], database_engine: DatabaseEngine
) -> set[int]:
    """Return a set of event data ids that are not used by any events in the db."""
    if not data_ids:
        return set()

    # See _select_unused_attributes_ids for why this function
    # branches for non-sqlite databases.
    if not database_engine.optimizer.slow_range_in_select:
        seen_ids = {
            state[0]
            for state in session.execute(
                data_ids_exist_in_events_with_fast_in_distinct(data_ids)
            ).all()
        }
    else:
        seen_ids = set()
        groups = [iter(data_ids)] * 100
        for data_ids_group in zip_longest(*groups, fillvalue=None):
            seen_ids |= {
                data_id[0]
                for data_id in session.execute(
                    data_ids_exist_in_events(*data_ids_group)  # type: ignore[arg-type]
                ).all()
                if data_id[0] is not None
            }
    to_remove = data_ids - seen_ids
    _LOGGER.debug("Selected %s shared event data to remove", len(to_remove))
    return to_remove


def _purge_unused_data_ids(
    instance: Recorder, session: Session, data_ids_batch: set[int]
) -> None:
    """Purge unused event data ids."""
    database_engine = instance.database_engine
    assert database_engine is not None
    if unused_data_ids_set := _select_unused_event_data_ids(
        session, data_ids_batch, database_engine
    ):
        _purge_batch_data_ids(instance, session, unused_data_ids_set)


def _select_statistics_runs_to_purge(
    session: Session, purge_before: datetime
) -> list[int]:
    """Return a list of statistic runs to purge.

    Takes care to keep the newest run.
    """
    statistic_runs = session.execute(find_statistics_runs_to_purge(purge_before)).all()
    statistic_runs_list = [run.run_id for run in statistic_runs]
    # Exclude the newest statistics run
    if (
        last_run := session.execute(find_latest_statistics_runs_run_id()).scalar()
    ) and last_run in statistic_runs_list:
        statistic_runs_list.remove(last_run)

    _LOGGER.debug("Selected %s statistic runs to remove", len(statistic_runs))
    return statistic_runs_list


def _select_short_term_statistics_to_purge(
    session: Session, purge_before: datetime
) -> list[int]:
    """Return a list of short term statistics to purge."""
    statistics = session.execute(
        find_short_term_statistics_to_purge(purge_before)
    ).all()
    _LOGGER.debug("Selected %s short term statistics to remove", len(statistics))
    return [statistic.id for statistic in statistics]


def _select_legacy_event_state_and_attributes_and_data_ids_to_purge(
    session: Session, purge_before: datetime
) -> tuple[set[int], set[int], set[int], set[int]]:
    """Return sets of event, state, attributes, and data ids to purge linked by event_id.

    We do not link these anymore since state_changed events
    do not exist in the events table anymore, however we
    still need to be able to purge them.
    """
    events = session.execute(
        find_legacy_event_state_and_attributes_and_data_ids_to_purge(
            dt_util.utc_to_timestamp(purge_before)
        )
    ).all()
    _LOGGER.debug("Selected %s event ids to remove", len(events))
    event_ids = set()
    state_ids = set()
    attributes_ids = set()
    data_ids = set()
    for event in events:
        event_ids.add(event.event_id)
        if event.state_id:
            state_ids.add(event.state_id)
        if event.attributes_id:
            attributes_ids.add(event.attributes_id)
        if event.data_id:
            data_ids.add(event.data_id)
    return event_ids, state_ids, attributes_ids, data_ids


def _purge_state_ids(instance: Recorder, session: Session, state_ids: set[int]) -> None:
    """Disconnect states and delete by state id."""
    if not state_ids:
        return

    # Update old_state_id to NULL before deleting to ensure
    # the delete does not fail due to a foreign key constraint
    # since some databases (MSSQL) cannot do the ON DELETE SET NULL
    # for us.
    disconnected_rows = session.execute(disconnect_states_rows(state_ids))
    _LOGGER.debug("Updated %s states to remove old_state_id", disconnected_rows)

    deleted_rows = session.execute(delete_states_rows(state_ids))
    _LOGGER.debug("Deleted %s states", deleted_rows)

    # Evict any entries in the old_states cache referring to a purged state
    instance.states_manager.evict_purged_state_ids(state_ids)

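
# chunked() from .util splits the ids into groups of at most
# SQLITE_MAX_BIND_VARS so each DELETE below stays within SQLite's bind
# variable limit, e.g. chunked([1, 2, 3], 2) -> [1, 2], [3].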
def _purge_batch_attributes_ids(
    instance: Recorder, session: Session, attributes_ids: set[int]
) -> None:
    """Delete old attributes ids in batches of SQLITE_MAX_BIND_VARS."""
    for attributes_ids_chunk in chunked(attributes_ids, SQLITE_MAX_BIND_VARS):
        deleted_rows = session.execute(
            delete_states_attributes_rows(attributes_ids_chunk)
        )
        _LOGGER.debug("Deleted %s attribute states", deleted_rows)

    # Evict any entries in the state_attributes_ids cache referring to a purged state
    instance.state_attributes_manager.evict_purged(attributes_ids)


def _purge_batch_data_ids(
    instance: Recorder, session: Session, data_ids: set[int]
) -> None:
    """Delete old event data ids in batches of SQLITE_MAX_BIND_VARS."""
    for data_ids_chunk in chunked(data_ids, SQLITE_MAX_BIND_VARS):
        deleted_rows = session.execute(delete_event_data_rows(data_ids_chunk))
        _LOGGER.debug("Deleted %s data events", deleted_rows)

    # Evict any entries in the event_data_ids cache referring to purged event data
    instance.event_data_manager.evict_purged(data_ids)


def _purge_statistics_runs(session: Session, statistics_runs: list[int]) -> None:
    """Delete by run_id."""
    deleted_rows = session.execute(delete_statistics_runs_rows(statistics_runs))
    _LOGGER.debug("Deleted %s statistic runs", deleted_rows)


def _purge_short_term_statistics(
    session: Session, short_term_statistics: list[int]
) -> None:
    """Delete by id."""
    deleted_rows = session.execute(
        delete_statistics_short_term_rows(short_term_statistics)
    )
    _LOGGER.debug("Deleted %s short term statistics", deleted_rows)


def _purge_event_ids(session: Session, event_ids: set[int]) -> None:
    """Delete by event id."""
    if not event_ids:
        return
    deleted_rows = session.execute(delete_event_rows(event_ids))
    _LOGGER.debug("Deleted %s events", deleted_rows)


def _purge_old_recorder_runs(
    instance: Recorder, session: Session, purge_before: datetime
) -> None:
    """Purge all old recorder runs."""
    # Recorder runs is small, no need to batch run it
    deleted_rows = session.execute(
        delete_recorder_runs_rows(purge_before, instance.run_history.current.run_id)
    )
    _LOGGER.debug("Deleted %s recorder_runs", deleted_rows)


def _purge_old_event_types(instance: Recorder, session: Session) -> None:
    """Purge all old event types."""
    # Event types is small, no need to batch run it
    purge_event_types = set()
    event_type_ids = set()
    for event_type_id, event_type in session.execute(find_event_types_to_purge()):
        purge_event_types.add(event_type)
        event_type_ids.add(event_type_id)

    if not event_type_ids:
        return

    deleted_rows = session.execute(delete_event_types_rows(event_type_ids))
    _LOGGER.debug("Deleted %s event types", deleted_rows)

    # Evict any entries in the event_type cache referring to a purged event type
    instance.event_type_manager.evict_purged(purge_event_types)


def _purge_old_entity_ids(instance: Recorder, session: Session) -> None:
    """Purge all old entity_ids."""
    # entity_ids are small, no need to batch run it
    purge_entity_ids = set()
    states_metadata_ids = set()
    for metadata_id, entity_id in session.execute(find_entity_ids_to_purge()):
        purge_entity_ids.add(entity_id)
        states_metadata_ids.add(metadata_id)

    if not states_metadata_ids:
        return

    deleted_rows = session.execute(delete_states_meta_rows(states_metadata_ids))
    _LOGGER.debug("Deleted %s states meta", deleted_rows)

    # Evict any entries in the states_meta and states caches referring to a
    # purged entity_id
    instance.states_meta_manager.evict_purged(purge_entity_ids)
    instance.states_manager.evict_purged_entity_ids(purge_entity_ids)


def _purge_filtered_data(instance: Recorder, session: Session) -> bool:
    """Remove filtered states and events that shouldn't be in the database."""
    _LOGGER.debug("Cleanup filtered data")
    database_engine = instance.database_engine
    assert database_engine is not None
    now_timestamp = time.time()

    # Check if excluded entity_ids are in database
    entity_filter = instance.entity_filter
    has_more_states_to_purge = False
    excluded_metadata_ids: list[str] = [
        metadata_id
        for (metadata_id, entity_id) in session.query(
            StatesMeta.metadata_id, StatesMeta.entity_id
        ).all()
        if not entity_filter(entity_id)
    ]
    if excluded_metadata_ids:
        has_more_states_to_purge = _purge_filtered_states(
            instance, session, excluded_metadata_ids, database_engine, now_timestamp
        )

    # Check if excluded event_types are in database
    has_more_events_to_purge = False
    if (
        event_type_to_event_type_ids := instance.event_type_manager.get_many(
            instance.exclude_event_types, session
        )
    ) and (
        excluded_event_type_ids := [
            event_type_id
            for event_type_id in event_type_to_event_type_ids.values()
            if event_type_id is not None
        ]
    ):
        has_more_events_to_purge = _purge_filtered_events(
            instance, session, excluded_event_type_ids, now_timestamp
        )

    # Purge has completed if there are no more states or events to purge
    return not (has_more_states_to_purge or has_more_events_to_purge)


def _purge_filtered_states(
    instance: Recorder,
    session: Session,
    metadata_ids_to_purge: list[str],
    database_engine: DatabaseEngine,
    purge_before_timestamp: float,
) -> bool:
    """Remove filtered states and linked events.

    Return true if all states are purged.
    """
    state_ids: tuple[int, ...]
    attributes_ids: tuple[int, ...]
    event_ids: tuple[int, ...]
    to_purge = list(
        session.query(States.state_id, States.attributes_id, States.event_id)
        .filter(States.metadata_id.in_(metadata_ids_to_purge))
        .filter(States.last_updated_ts < purge_before_timestamp)
        .limit(SQLITE_MAX_BIND_VARS)
        .all()
    )
    if not to_purge:
        return True
    state_ids, attributes_ids, event_ids = zip(*to_purge)
    filtered_event_ids = {id_ for id_ in event_ids if id_ is not None}
    _LOGGER.debug(
        "Selected %s state_ids to remove that should be filtered", len(state_ids)
    )
    _purge_state_ids(instance, session, set(state_ids))
    # These are legacy events linked to a state; they are no longer created,
    # but since we did not remove them when we stopped adding new ones we
    # need to purge them here.
    _purge_event_ids(session, filtered_event_ids)
    unused_attribute_ids_set = _select_unused_attributes_ids(
        session, {id_ for id_ in attributes_ids if id_ is not None}, database_engine
    )
    _purge_batch_attributes_ids(instance, session, unused_attribute_ids_set)
    return False


def _purge_filtered_events(
    instance: Recorder,
    session: Session,
    excluded_event_type_ids: list[int],
    purge_before_timestamp: float,
) -> bool:
    """Remove filtered events and linked states.

    Return true if all events are purged.
    """
    database_engine = instance.database_engine
    assert database_engine is not None
    to_purge = list(
        session.query(Events.event_id, Events.data_id)
        .filter(Events.event_type_id.in_(excluded_event_type_ids))
        .filter(Events.time_fired_ts < purge_before_timestamp)
        .limit(SQLITE_MAX_BIND_VARS)
        .all()
    )
    if not to_purge:
        return True
    event_ids, data_ids = zip(*to_purge)
    event_ids_set = set(event_ids)
    _LOGGER.debug(
        "Selected %s event_ids to remove that should be filtered", len(event_ids_set)
    )
    if (
        instance.use_legacy_events_index
        and (
            states := session.query(States.state_id)
            .filter(States.event_id.in_(event_ids_set))
            .all()
        )
        and (state_ids := {state.state_id for state in states})
    ):
        # These are legacy states linked to an event; they are no longer
        # created, but since we did not remove them when we stopped adding
        # new ones we need to purge them here.
        _purge_state_ids(instance, session, state_ids)
    _purge_event_ids(session, event_ids_set)
    if unused_data_ids_set := _select_unused_event_data_ids(
        session, set(data_ids), database_engine
    ):
        _purge_batch_data_ids(instance, session, unused_data_ids_set)
    return False

@retryable_database_job("purge_entity_data")
|
|
def purge_entity_data(
|
|
instance: Recorder, entity_filter: Callable[[str], bool], purge_before: datetime
|
|
) -> bool:
|
|
"""Purge states and events of specified entities."""
|
|
database_engine = instance.database_engine
|
|
assert database_engine is not None
|
|
purge_before_timestamp = purge_before.timestamp()
|
|
with session_scope(session=instance.get_session()) as session:
|
|
selected_metadata_ids: list[str] = [
|
|
metadata_id
|
|
for (metadata_id, entity_id) in session.query(
|
|
StatesMeta.metadata_id, StatesMeta.entity_id
|
|
).all()
|
|
if entity_filter(entity_id)
|
|
]
|
|
_LOGGER.debug("Purging entity data for %s", selected_metadata_ids)
|
|
if not selected_metadata_ids:
|
|
return True
|
|
|
|
# Purge a max of SQLITE_MAX_BIND_VARS, based on the oldest states
|
|
# or events record.
|
|
if not _purge_filtered_states(
|
|
instance,
|
|
session,
|
|
selected_metadata_ids,
|
|
database_engine,
|
|
purge_before_timestamp,
|
|
):
|
|
_LOGGER.debug("Purging entity data hasn't fully completed yet")
|
|
return False
|
|
|
|
return True
|
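
# Both retryable purge jobs above return False while rows remain so the
# recorder can schedule another pass, and True once a cycle is complete.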