From 9fbde245d01b9465c7a489962b2142f554a5dbf5 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 2 Jun 2022 11:54:06 -1000 Subject: [PATCH] Fix performance of logbook entity and devices queries with large MySQL databases (#72898) --- .../components/logbook/queries/common.py | 20 +++++++-- .../components/logbook/queries/devices.py | 32 +++++++++----- .../components/logbook/queries/entities.py | 39 ++++++++++------ .../logbook/queries/entities_and_devices.py | 44 ++++++++++++------- homeassistant/components/recorder/models.py | 2 + 5 files changed, 93 insertions(+), 44 deletions(-) diff --git a/homeassistant/components/logbook/queries/common.py b/homeassistant/components/logbook/queries/common.py index 6049d6beb81..a7a4f84a59e 100644 --- a/homeassistant/components/logbook/queries/common.py +++ b/homeassistant/components/logbook/queries/common.py @@ -12,9 +12,11 @@ from sqlalchemy.sql.selectable import Select from homeassistant.components.proximity import DOMAIN as PROXIMITY_DOMAIN from homeassistant.components.recorder.models import ( + EVENTS_CONTEXT_ID_INDEX, OLD_FORMAT_ATTRS_JSON, OLD_STATE, SHARED_ATTRS_JSON, + STATES_CONTEXT_ID_INDEX, EventData, Events, StateAttributes, @@ -121,9 +123,7 @@ def select_events_context_only() -> Select: By marking them as context_only we know they are only for linking context ids and we can avoid processing them. 
""" - return select(*EVENT_ROWS_NO_STATES, CONTEXT_ONLY).outerjoin( - EventData, (Events.data_id == EventData.data_id) - ) + return select(*EVENT_ROWS_NO_STATES, CONTEXT_ONLY) def select_states_context_only() -> Select: @@ -252,3 +252,17 @@ def _not_uom_attributes_matcher() -> ClauseList: return ~StateAttributes.shared_attrs.like( UNIT_OF_MEASUREMENT_JSON_LIKE ) | ~States.attributes.like(UNIT_OF_MEASUREMENT_JSON_LIKE) + + +def apply_states_context_hints(query: Query) -> Query: + """Force mysql to use the right index on large context_id selects.""" + return query.with_hint( + States, f"FORCE INDEX ({STATES_CONTEXT_ID_INDEX})", dialect_name="mysql" + ) + + +def apply_events_context_hints(query: Query) -> Query: + """Force mysql to use the right index on large context_id selects.""" + return query.with_hint( + Events, f"FORCE INDEX ({EVENTS_CONTEXT_ID_INDEX})", dialect_name="mysql" + ) diff --git a/homeassistant/components/logbook/queries/devices.py b/homeassistant/components/logbook/queries/devices.py index 64a6477017e..88e9f50a42c 100644 --- a/homeassistant/components/logbook/queries/devices.py +++ b/homeassistant/components/logbook/queries/devices.py @@ -4,15 +4,22 @@ from __future__ import annotations from collections.abc import Iterable from datetime import datetime as dt -from sqlalchemy import lambda_stmt, select, union_all +from sqlalchemy import lambda_stmt, select from sqlalchemy.orm import Query from sqlalchemy.sql.elements import ClauseList from sqlalchemy.sql.lambdas import StatementLambdaElement from sqlalchemy.sql.selectable import CTE, CompoundSelect -from homeassistant.components.recorder.models import DEVICE_ID_IN_EVENT, Events, States +from homeassistant.components.recorder.models import ( + DEVICE_ID_IN_EVENT, + EventData, + Events, + States, +) from .common import ( + apply_events_context_hints, + apply_states_context_hints, select_events_context_id_subquery, select_events_context_only, select_events_without_states, @@ -27,13 +34,10 @@ def 
_select_device_id_context_ids_sub_query( json_quotable_device_ids: list[str], ) -> CompoundSelect: """Generate a subquery to find context ids for multiple devices.""" - return select( - union_all( - select_events_context_id_subquery(start_day, end_day, event_types).where( - apply_event_device_id_matchers(json_quotable_device_ids) - ), - ).c.context_id + inner = select_events_context_id_subquery(start_day, end_day, event_types).where( + apply_event_device_id_matchers(json_quotable_device_ids) ) + return select(inner.c.context_id).group_by(inner.c.context_id) def _apply_devices_context_union( @@ -51,8 +55,16 @@ def _apply_devices_context_union( json_quotable_device_ids, ).cte() return query.union_all( - select_events_context_only().where(Events.context_id.in_(devices_cte.select())), - select_states_context_only().where(States.context_id.in_(devices_cte.select())), + apply_events_context_hints( + select_events_context_only() + .select_from(devices_cte) + .outerjoin(Events, devices_cte.c.context_id == Events.context_id) + ).outerjoin(EventData, (Events.data_id == EventData.data_id)), + apply_states_context_hints( + select_states_context_only() + .select_from(devices_cte) + .outerjoin(States, devices_cte.c.context_id == States.context_id) + ), ) diff --git a/homeassistant/components/logbook/queries/entities.py b/homeassistant/components/logbook/queries/entities.py index 4fb211688f3..8de4a5eaf64 100644 --- a/homeassistant/components/logbook/queries/entities.py +++ b/homeassistant/components/logbook/queries/entities.py @@ -14,11 +14,14 @@ from homeassistant.components.recorder.models import ( ENTITY_ID_IN_EVENT, ENTITY_ID_LAST_UPDATED_INDEX, OLD_ENTITY_ID_IN_EVENT, + EventData, Events, States, ) from .common import ( + apply_events_context_hints, + apply_states_context_hints, apply_states_filters, select_events_context_id_subquery, select_events_context_only, @@ -36,16 +39,15 @@ def _select_entities_context_ids_sub_query( json_quotable_entity_ids: list[str], ) -> 
CompoundSelect: """Generate a subquery to find context ids for multiple entities.""" - return select( - union_all( - select_events_context_id_subquery(start_day, end_day, event_types).where( - apply_event_entity_id_matchers(json_quotable_entity_ids) - ), - apply_entities_hints(select(States.context_id)) - .filter((States.last_updated > start_day) & (States.last_updated < end_day)) - .where(States.entity_id.in_(entity_ids)), - ).c.context_id + union = union_all( + select_events_context_id_subquery(start_day, end_day, event_types).where( + apply_event_entity_id_matchers(json_quotable_entity_ids) + ), + apply_entities_hints(select(States.context_id)) + .filter((States.last_updated > start_day) & (States.last_updated < end_day)) + .where(States.entity_id.in_(entity_ids)), ) + return select(union.c.context_id).group_by(union.c.context_id) def _apply_entities_context_union( @@ -64,14 +66,23 @@ def _apply_entities_context_union( entity_ids, json_quotable_entity_ids, ).cte() + # We used to optimize this to exclude rows we already have in the union with + # a States.entity_id.not_in(entity_ids) but that made the + # query much slower on MySQL, and since we already filter them away + # in the python code anyway since they will have context_only + # set on them the impact is minimal. 
return query.union_all( states_query_for_entity_ids(start_day, end_day, entity_ids), - select_events_context_only().where( - Events.context_id.in_(entities_cte.select()) + apply_events_context_hints( + select_events_context_only() + .select_from(entities_cte) + .outerjoin(Events, entities_cte.c.context_id == Events.context_id) + ).outerjoin(EventData, (Events.data_id == EventData.data_id)), + apply_states_context_hints( + select_states_context_only() + .select_from(entities_cte) + .outerjoin(States, entities_cte.c.context_id == States.context_id) ), - select_states_context_only() - .where(States.entity_id.not_in(entity_ids)) - .where(States.context_id.in_(entities_cte.select())), ) diff --git a/homeassistant/components/logbook/queries/entities_and_devices.py b/homeassistant/components/logbook/queries/entities_and_devices.py index d1c86ddbec5..1c4271422b7 100644 --- a/homeassistant/components/logbook/queries/entities_and_devices.py +++ b/homeassistant/components/logbook/queries/entities_and_devices.py @@ -10,9 +10,11 @@ from sqlalchemy.orm import Query from sqlalchemy.sql.lambdas import StatementLambdaElement from sqlalchemy.sql.selectable import CTE, CompoundSelect -from homeassistant.components.recorder.models import Events, States +from homeassistant.components.recorder.models import EventData, Events, States from .common import ( + apply_events_context_hints, + apply_states_context_hints, select_events_context_id_subquery, select_events_context_only, select_events_without_states, @@ -35,18 +37,17 @@ def _select_entities_device_id_context_ids_sub_query( json_quotable_device_ids: list[str], ) -> CompoundSelect: """Generate a subquery to find context ids for multiple entities and multiple devices.""" - return select( - union_all( - select_events_context_id_subquery(start_day, end_day, event_types).where( - _apply_event_entity_id_device_id_matchers( - json_quotable_entity_ids, json_quotable_device_ids - ) - ), - apply_entities_hints(select(States.context_id)) - 
.filter((States.last_updated > start_day) & (States.last_updated < end_day)) - .where(States.entity_id.in_(entity_ids)), - ).c.context_id + union = union_all( + select_events_context_id_subquery(start_day, end_day, event_types).where( + _apply_event_entity_id_device_id_matchers( + json_quotable_entity_ids, json_quotable_device_ids + ) + ), + apply_entities_hints(select(States.context_id)) + .filter((States.last_updated > start_day) & (States.last_updated < end_day)) + .where(States.entity_id.in_(entity_ids)), ) + return select(union.c.context_id).group_by(union.c.context_id) def _apply_entities_devices_context_union( @@ -66,14 +67,23 @@ def _apply_entities_devices_context_union( json_quotable_entity_ids, json_quotable_device_ids, ).cte() + # We used to optimize this to exclude rows we already have in the union with + # a States.entity_id.not_in(entity_ids) but that made the + # query much slower on MySQL, and since we already filter them away + # in the python code anyway since they will have context_only + # set on them the impact is minimal. 
return query.union_all( states_query_for_entity_ids(start_day, end_day, entity_ids), - select_events_context_only().where( - Events.context_id.in_(devices_entities_cte.select()) + apply_events_context_hints( + select_events_context_only() + .select_from(devices_entities_cte) + .outerjoin(Events, devices_entities_cte.c.context_id == Events.context_id) + ).outerjoin(EventData, (Events.data_id == EventData.data_id)), + apply_states_context_hints( + select_states_context_only() + .select_from(devices_entities_cte) + .outerjoin(States, devices_entities_cte.c.context_id == States.context_id) ), - select_states_context_only() - .where(States.entity_id.not_in(entity_ids)) - .where(States.context_id.in_(devices_entities_cte.select())), ) diff --git a/homeassistant/components/recorder/models.py b/homeassistant/components/recorder/models.py index 70c816c2af5..8db648f15a8 100644 --- a/homeassistant/components/recorder/models.py +++ b/homeassistant/components/recorder/models.py @@ -93,6 +93,8 @@ TABLES_TO_CHECK = [ LAST_UPDATED_INDEX = "ix_states_last_updated" ENTITY_ID_LAST_UPDATED_INDEX = "ix_states_entity_id_last_updated" +EVENTS_CONTEXT_ID_INDEX = "ix_events_context_id" +STATES_CONTEXT_ID_INDEX = "ix_states_context_id" EMPTY_JSON_OBJECT = "{}"