From eac9ad8437f33ee958e40d394d2e4fe605970d5e Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 19 Feb 2023 20:05:45 -0600 Subject: [PATCH] Fix most recent states query not using the index for group by (#88461) * Fix most recent states query not using the index for group by fixes #87851 * Apply suggestions from code review * reduce --- homeassistant/components/recorder/history.py | 145 ++++++++++--------- 1 file changed, 78 insertions(+), 67 deletions(-) diff --git a/homeassistant/components/recorder/history.py b/homeassistant/components/recorder/history.py index 387cd405759..e466cd07e0e 100644 --- a/homeassistant/components/recorder/history.py +++ b/homeassistant/components/recorder/history.py @@ -583,7 +583,49 @@ def get_last_state_changes( ) -def _get_states_for_entites_stmt( +def _generate_most_recent_states_for_entities_by_date( + schema_version: int, + run_start: datetime, + utc_point_in_time: datetime, + entity_ids: list[str], +) -> Subquery: + """Generate the sub query for the most recent states for specific entities by date.""" + if schema_version >= 31: + run_start_ts = process_timestamp(run_start).timestamp() + utc_point_in_time_ts = dt_util.utc_to_timestamp(utc_point_in_time) + return ( + select( + States.entity_id.label("max_entity_id"), + # https://github.com/sqlalchemy/sqlalchemy/issues/9189 + # pylint: disable-next=not-callable + func.max(States.last_updated_ts).label("max_last_updated"), + ) + .filter( + (States.last_updated_ts >= run_start_ts) + & (States.last_updated_ts < utc_point_in_time_ts) + ) + .filter(States.entity_id.in_(entity_ids)) + .group_by(States.entity_id) + .subquery() + ) + return ( + select( + States.entity_id.label("max_entity_id"), + # https://github.com/sqlalchemy/sqlalchemy/issues/9189 + # pylint: disable-next=not-callable + func.max(States.last_updated).label("max_last_updated"), + ) + .filter( + (States.last_updated >= run_start) + & (States.last_updated < utc_point_in_time) + ) + .filter(States.entity_id.in_(entity_ids)) + .group_by(States.entity_id) + .subquery() + ) + + +def _get_states_for_entities_stmt( schema_version: int, run_start: datetime, utc_point_in_time: datetime, @@ -594,41 +636,32 @@ def _get_states_for_entites_stmt( stmt, join_attributes = lambda_stmt_and_join_attributes( schema_version, no_attributes, include_last_changed=True ) + most_recent_states_for_entities_by_date = ( + _generate_most_recent_states_for_entities_by_date( + schema_version, run_start, utc_point_in_time, entity_ids + ) + ) # We got an include-list of entities, accelerate the query by filtering already # in the inner query. 
if schema_version >= 31: - run_start_ts = process_timestamp(run_start).timestamp() - utc_point_in_time_ts = dt_util.utc_to_timestamp(utc_point_in_time) - stmt += lambda q: q.where( - States.state_id - == ( - # https://github.com/sqlalchemy/sqlalchemy/issues/9189 - # pylint: disable-next=not-callable - select(func.max(States.state_id).label("max_state_id")) - .filter( - (States.last_updated_ts >= run_start_ts) - & (States.last_updated_ts < utc_point_in_time_ts) - ) - .filter(States.entity_id.in_(entity_ids)) - .group_by(States.entity_id) - .subquery() - ).c.max_state_id + stmt += lambda q: q.join( + most_recent_states_for_entities_by_date, + and_( + States.entity_id + == most_recent_states_for_entities_by_date.c.max_entity_id, + States.last_updated_ts + == most_recent_states_for_entities_by_date.c.max_last_updated, + ), ) else: - stmt += lambda q: q.where( - States.state_id - == ( - # https://github.com/sqlalchemy/sqlalchemy/issues/9189 - # pylint: disable-next=not-callable - select(func.max(States.state_id).label("max_state_id")) - .filter( - (States.last_updated >= run_start) - & (States.last_updated < utc_point_in_time) - ) - .filter(States.entity_id.in_(entity_ids)) - .group_by(States.entity_id) - .subquery() - ).c.max_state_id + stmt += lambda q: q.join( + most_recent_states_for_entities_by_date, + and_( + States.entity_id + == most_recent_states_for_entities_by_date.c.max_entity_id, + States.last_updated + == most_recent_states_for_entities_by_date.c.max_last_updated, + ), ) if join_attributes: stmt += lambda q: q.outerjoin( @@ -642,7 +675,7 @@ def _generate_most_recent_states_by_date( run_start: datetime, utc_point_in_time: datetime, ) -> Subquery: - """Generate the sub query for the most recent states by data.""" + """Generate the sub query for the most recent states by date.""" if schema_version >= 31: run_start_ts = process_timestamp(run_start).timestamp() utc_point_in_time_ts = dt_util.utc_to_timestamp(utc_point_in_time) @@ -695,42 +728,20 @@ def _get_states_for_all_stmt( schema_version, run_start, utc_point_in_time ) if schema_version >= 31: - stmt += lambda q: q.where( - States.state_id - == ( - # https://github.com/sqlalchemy/sqlalchemy/issues/9189 - # pylint: disable-next=not-callable - select(func.max(States.state_id).label("max_state_id")) - .join( - most_recent_states_by_date, - and_( - States.entity_id == most_recent_states_by_date.c.max_entity_id, - States.last_updated_ts - == most_recent_states_by_date.c.max_last_updated, - ), - ) - .group_by(States.entity_id) - .subquery() - ).c.max_state_id, + stmt += lambda q: q.join( + most_recent_states_by_date, + and_( + States.entity_id == most_recent_states_by_date.c.max_entity_id, + States.last_updated_ts == most_recent_states_by_date.c.max_last_updated, + ), ) else: - stmt += lambda q: q.where( - States.state_id - == ( - # https://github.com/sqlalchemy/sqlalchemy/issues/9189 - # pylint: disable-next=not-callable - select(func.max(States.state_id).label("max_state_id")) - .join( - most_recent_states_by_date, - and_( - States.entity_id == most_recent_states_by_date.c.max_entity_id, - States.last_updated - == most_recent_states_by_date.c.max_last_updated, - ), - ) - .group_by(States.entity_id) - .subquery() - ).c.max_state_id, + stmt += lambda q: q.join( + most_recent_states_by_date, + and_( + States.entity_id == most_recent_states_by_date.c.max_entity_id, + States.last_updated == most_recent_states_by_date.c.max_last_updated, + ), ) stmt += _ignore_domains_filter if filters and filters.has_config: @@ -772,7 +783,7 @@ def 
_get_rows_with_session( # We have more than one entity to look at so we need to do a query on states # since the last recorder run started. if entity_ids: - stmt = _get_states_for_entites_stmt( + stmt = _get_states_for_entities_stmt( schema_version, run.start, utc_point_in_time, entity_ids, no_attributes ) else:
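
For readers who want to try the query shape from this patch outside of Home Assistant, below is a minimal SQLAlchemy 2.0 sketch of the same pattern: a GROUP BY subquery selecting (entity_id, MAX(last_updated_ts)) over a time window, joined back to the states table on both columns, instead of filtering on a MAX(state_id) subquery. The States model, the helper name most_recent_states_stmt, and the in-memory SQLite setup are simplified stand-ins for illustration; they are not the recorder's real schema or API.

# Minimal sketch of the join pattern introduced by this patch.
# The States model is a simplified stand-in; only the columns relevant
# to the query shape are included.
from datetime import datetime, timezone

from sqlalchemy import Float, String, and_, create_engine, func, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class States(Base):
    __tablename__ = "states"

    state_id: Mapped[int] = mapped_column(primary_key=True)
    entity_id: Mapped[str] = mapped_column(String(255), index=True)
    state: Mapped[str] = mapped_column(String(255))
    last_updated_ts: Mapped[float] = mapped_column(Float, index=True)


def most_recent_states_stmt(
    run_start_ts: float, utc_point_in_time_ts: float, entity_ids: list[str]
):
    """Select the most recent state row per entity within the window.

    Rather than comparing state_id against a MAX(state_id) subquery,
    build a GROUP BY subquery over (entity_id, MAX(last_updated_ts))
    and join the states table back to it on both columns.
    """
    most_recent = (
        select(
            States.entity_id.label("max_entity_id"),
            func.max(States.last_updated_ts).label("max_last_updated"),
        )
        .where(
            States.last_updated_ts >= run_start_ts,
            States.last_updated_ts < utc_point_in_time_ts,
        )
        .where(States.entity_id.in_(entity_ids))
        .group_by(States.entity_id)
        .subquery()
    )
    return select(States).join(
        most_recent,
        and_(
            States.entity_id == most_recent.c.max_entity_id,
            States.last_updated_ts == most_recent.c.max_last_updated,
        ),
    )


if __name__ == "__main__":
    # Illustrative data only: two states for one entity, one for another.
    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    now = datetime.now(timezone.utc).timestamp()
    with Session(engine) as session:
        session.add_all(
            [
                States(entity_id="light.kitchen", state="on", last_updated_ts=now - 60),
                States(entity_id="light.kitchen", state="off", last_updated_ts=now - 10),
                States(entity_id="sensor.temp", state="21.5", last_updated_ts=now - 5),
            ]
        )
        session.commit()
        stmt = most_recent_states_stmt(
            now - 3600, now, ["light.kitchen", "sensor.temp"]
        )
        for row in session.scalars(stmt):
            print(row.entity_id, row.state)  # latest row per entity

The intent, per the PR title, is that grouping and joining on (entity_id, last_updated) lets the database satisfy the GROUP BY from the composite entity_id/last_updated index, whereas an aggregate over state_id cannot be answered from that index.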