Remove the old ix_states_event_id index if it's no longer being used (#89901)

* Remove the old ix_states_event_id index if it's no longer being used

* cover it

* fixes

* fixup

* Update homeassistant/components/recorder/tasks.py
This commit is contained in:
J. Nick Koston 2023-03-17 14:27:33 -10:00 committed by GitHub
parent 138bbd9c28
commit b1f64de6ce
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 124 additions and 22 deletions

View file

@ -48,6 +48,8 @@ CONTEXT_ID_AS_BINARY_SCHEMA_VERSION = 36
EVENT_TYPE_IDS_SCHEMA_VERSION = 37
STATES_META_SCHEMA_VERSION = 38
LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION = 28
class SupportedDialect(StrEnum):
"""Supported dialects."""

View file

@ -46,6 +46,7 @@ from .const import (
DOMAIN,
EVENT_TYPE_IDS_SCHEMA_VERSION,
KEEPALIVE_TIME,
LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION,
MARIADB_PYMYSQL_URL_PREFIX,
MARIADB_URL_PREFIX,
MAX_QUEUE_BACKLOG,
@ -57,7 +58,9 @@ from .const import (
SupportedDialect,
)
from .db_schema import (
LEGACY_STATES_EVENT_ID_INDEX,
SCHEMA_VERSION,
TABLE_STATES,
Base,
EventData,
Events,
@ -93,6 +96,7 @@ from .tasks import (
CompileMissingStatisticsTask,
DatabaseLockTask,
EntityIDMigrationTask,
EventIdMigrationTask,
EventsContextIDMigrationTask,
EventTask,
EventTypeIDMigrationTask,
@ -113,6 +117,7 @@ from .util import (
dburl_to_path,
end_incomplete_runs,
execute_stmt_lambda_element,
get_index_by_name,
is_second_sunday,
move_away_broken_database,
session_scope,
@ -730,6 +735,15 @@ class Recorder(threading.Thread):
_LOGGER.debug("Activating states_meta manager as all data is migrated")
self.states_meta_manager.active = True
if self.schema_version > LEGACY_STATES_EVENT_ID_INDEX_SCHEMA_VERSION:
with contextlib.suppress(SQLAlchemyError):
# If the index of event_ids on the states table is still present
# we need to queue a task to remove it.
if get_index_by_name(
session, TABLE_STATES, LEGACY_STATES_EVENT_ID_INDEX
):
self.queue_task(EventIdMigrationTask())
# We must only set the db ready after we have set the table managers
# to active if there is no data to migrate.
#
@ -1138,6 +1152,10 @@ class Recorder(threading.Thread):
"""Post migrate entity_ids if needed."""
return migration.post_migrate_entity_ids(self)
def _cleanup_legacy_states_event_ids(self) -> bool:
"""Cleanup legacy event_ids if needed."""
return migration.cleanup_legacy_states_event_ids(self)
def _send_keep_alive(self) -> None:
"""Send a keep alive to keep the db connection open."""
assert self.event_session is not None

View file

@ -116,6 +116,7 @@ LAST_UPDATED_INDEX_TS = "ix_states_last_updated_ts"
METADATA_ID_LAST_UPDATED_INDEX_TS = "ix_states_metadata_id_last_updated_ts"
EVENTS_CONTEXT_ID_BIN_INDEX = "ix_events_context_id_bin"
STATES_CONTEXT_ID_BIN_INDEX = "ix_states_context_id_bin"
LEGACY_STATES_EVENT_ID_INDEX = "ix_states_event_id"
CONTEXT_ID_BIN_MAX_LENGTH = 16
_DEFAULT_TABLE_ARGS = {
@ -385,9 +386,7 @@ class States(Base):
attributes: Mapped[str | None] = mapped_column(
Text().with_variant(mysql.LONGTEXT, "mysql", "mariadb")
) # no longer used for new rows
event_id: Mapped[int | None] = mapped_column( # no longer used for new rows
Integer, ForeignKey("events.event_id", ondelete="CASCADE"), index=True
)
event_id: Mapped[int | None] = mapped_column(Integer) # no longer used for new rows
last_changed: Mapped[datetime | None] = mapped_column(
DATETIME_TYPE
) # no longer used for new rows

View file

@ -30,6 +30,7 @@ from homeassistant.util.ulid import ulid_to_bytes
from .const import SupportedDialect
from .db_schema import (
CONTEXT_ID_BIN_MAX_LENGTH,
LEGACY_STATES_EVENT_ID_INDEX,
SCHEMA_VERSION,
STATISTICS_TABLES,
TABLE_STATES,
@ -51,6 +52,7 @@ from .queries import (
find_event_type_to_migrate,
find_events_context_ids_to_migrate,
find_states_context_ids_to_migrate,
has_used_states_event_ids,
)
from .statistics import (
correct_db_schema as statistics_correct_db_schema,
@ -64,7 +66,12 @@ from .tasks import (
PostSchemaMigrationTask,
StatisticsTimestampMigrationCleanupTask,
)
from .util import database_job_retry_wrapper, retryable_database_job, session_scope
from .util import (
database_job_retry_wrapper,
get_index_by_name,
retryable_database_job,
session_scope,
)
if TYPE_CHECKING:
from . import Recorder
@ -308,18 +315,7 @@ def _drop_index(
with session_scope(session=session_maker()) as session, contextlib.suppress(
SQLAlchemyError
):
connection = session.connection()
inspector = sqlalchemy.inspect(connection)
indexes = inspector.get_indexes(table_name)
if index_to_drop := next(
(
possible_index["name"]
for possible_index in indexes
if possible_index["name"]
and possible_index["name"].endswith(f"_{index_name}")
),
None,
):
if index_to_drop := get_index_by_name(session, table_name, index_name):
connection.execute(text(f"DROP INDEX {index_to_drop}"))
success = True
@ -593,7 +589,7 @@ def _apply_update( # noqa: C901
# but it was removed in version 32
elif new_version == 5:
# Create supporting index for States.event_id foreign key
_create_index(session_maker, "states", "ix_states_event_id")
_create_index(session_maker, "states", LEGACY_STATES_EVENT_ID_INDEX)
elif new_version == 6:
_add_columns(
session_maker,
@ -1529,6 +1525,33 @@ def post_migrate_entity_ids(instance: Recorder) -> bool:
return is_done
@retryable_database_job("cleanup_legacy_event_ids")
def cleanup_legacy_states_event_ids(instance: Recorder) -> bool:
    """Remove old event_id index from states.

    We used to link states to events using the event_id column but we no
    longer store state changed events in the events table.

    If all old states have been purged and existing states are in the new
    format we can drop the index since it can take up ~10MB per 1M rows.

    Returns True when the work is finished (the retry decorator re-queues
    the job on transient database errors).
    """
    session_maker = instance.get_session
    # Fixed log message: this path cleans up event_ids, not entity_ids
    # (the old text was copy-pasted from the entity_id migration).
    _LOGGER.debug("Cleanup legacy event_ids")
    with session_scope(session=session_maker()) as session:
        result = session.execute(has_used_states_event_ids()).scalar()
        # In the future we may migrate existing states to the new format
        # but in practice very few of these still exist in production and
        # removing the index is likely all that needs to happen.
        all_gone = not result

    if all_gone:
        # Only drop the index if there are no more event_ids in the states
        # table, i.e. the column is all NULL.
        _drop_index(session_maker, "states", LEGACY_STATES_EVENT_ID_INDEX)

    return True
def _initialize_database(session: Session) -> bool:
"""Initialize a new database.

View file

@ -745,6 +745,13 @@ def batch_cleanup_entity_ids() -> StatementLambdaElement:
)
def has_used_states_event_ids() -> StatementLambdaElement:
    """Check if there are used event_ids in the states table."""
    # A single matching row is enough to know legacy event_ids are still
    # in use, so limit the scan to one result.
    return lambda_stmt(
        lambda: select(States.state_id).where(States.event_id.isnot(None)).limit(1)
    )
def has_events_context_ids_to_migrate() -> StatementLambdaElement:
"""Check if there are events context ids to migrate."""
return lambda_stmt(

View file

@ -438,3 +438,17 @@ class EntityIDPostMigrationTask(RecorderTask):
):
# Schedule a new migration task if this one didn't finish
instance.queue_task(EntityIDPostMigrationTask())
@dataclass
class EventIdMigrationTask(RecorderTask):
    """An object to insert into the recorder queue to clean up legacy event_ids in the states table.

    This task should only be queued if the ix_states_event_id index exists
    since it is used to scan the states table and it will be removed after this
    task is run if it's no longer needed.
    """

    def run(self, instance: Recorder) -> None:
        """Clean up the legacy event_id index on states."""
        # Delegates to the Recorder's private helper; tasks are trusted
        # internal callers, hence the suppressed protected-access warning.
        instance._cleanup_legacy_states_event_ids()  # pylint: disable=[protected-access]

View file

@ -18,7 +18,7 @@ from awesomeversion import (
AwesomeVersionStrategy,
)
import ciso8601
from sqlalchemy import text
from sqlalchemy import inspect, text
from sqlalchemy.engine import Result, Row
from sqlalchemy.exc import OperationalError, SQLAlchemyError
from sqlalchemy.orm.query import Query
@ -832,3 +832,22 @@ def chunked(iterable: Iterable, chunked_num: int) -> Iterable[Any]:
From more-itertools
"""
return iter(partial(take, chunked_num, iter(iterable)), [])
def get_index_by_name(session: Session, table_name: str, index_name: str) -> str | None:
    """Get an index by name.

    Returns the actual index name when one matches exactly or ends with
    ``_{index_name}`` (some backends prefix index names), else None.
    """
    inspector = inspect(session.connection())
    suffix = f"_{index_name}"
    for index in inspector.get_indexes(table_name):
        found_name = index["name"]
        if found_name and (found_name == index_name or found_name.endswith(suffix)):
            return found_name
    return None

View file

@ -91,6 +91,10 @@ async def test_migrate_times(
)
number_of_migrations = 5
def _get_states_index_names():
with session_scope(hass=hass) as session:
return inspect(session.connection()).get_indexes("states")
with patch.object(recorder, "db_schema", old_db_schema), patch.object(
recorder.migration, "SCHEMA_VERSION", old_db_schema.SCHEMA_VERSION
), patch.object(core, "StatesMeta", old_db_schema.StatesMeta), patch.object(
@ -113,6 +117,8 @@ async def test_migrate_times(
"homeassistant.components.recorder.Recorder._migrate_entity_ids",
), patch(
"homeassistant.components.recorder.Recorder._post_migrate_entity_ids"
), patch(
"homeassistant.components.recorder.Recorder._cleanup_legacy_states_event_ids"
):
hass = await async_test_home_assistant(asyncio.get_running_loop())
recorder_helper.async_initialize_recorder(hass)
@ -132,11 +138,18 @@ async def test_migrate_times(
await hass.async_block_till_done()
await recorder.get_instance(hass).async_block_till_done()
states_indexes = await recorder.get_instance(hass).async_add_executor_job(
_get_states_index_names
)
states_index_names = {index["name"] for index in states_indexes}
await hass.async_stop()
await hass.async_block_till_done()
dt_util.DEFAULT_TIME_ZONE = ORIG_TZ
assert "ix_states_event_id" in states_index_names
# Test that the duplicates are removed during migration from schema 23
hass = await async_test_home_assistant(asyncio.get_running_loop())
recorder_helper.async_initialize_recorder(hass)
@ -186,13 +199,20 @@ async def test_migrate_times(
with session_scope(hass=hass) as session:
return inspect(session.connection()).get_indexes("events")
indexes = await recorder.get_instance(hass).async_add_executor_job(
events_indexes = await recorder.get_instance(hass).async_add_executor_job(
_get_events_index_names
)
index_names = {index["name"] for index in indexes}
events_index_names = {index["name"] for index in events_indexes}
assert "ix_events_context_id_bin" in index_names
assert "ix_events_context_id" not in index_names
assert "ix_events_context_id_bin" in events_index_names
assert "ix_events_context_id" not in events_index_names
states_indexes = await recorder.get_instance(hass).async_add_executor_job(
_get_states_index_names
)
states_index_names = {index["name"] for index in states_indexes}
assert "ix_states_event_id" not in states_index_names
await hass.async_stop()
dt_util.DEFAULT_TIME_ZONE = ORIG_TZ