From ee98849360a284447aa6238e20c670f098300b46 Mon Sep 17 00:00:00 2001
From: Erik Montnemery
Date: Tue, 12 Oct 2021 19:39:36 +0200
Subject: [PATCH] Always include start point for statistics (#57182)

---
 .../components/recorder/statistics.py    | 65 +++++++++++++++++--
 tests/components/sensor/test_recorder.py | 23 ++++++-
 2 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/homeassistant/components/recorder/statistics.py b/homeassistant/components/recorder/statistics.py
index 200da8d192d..2dc18d3aecb 100644
--- a/homeassistant/components/recorder/statistics.py
+++ b/homeassistant/components/recorder/statistics.py
@@ -5,7 +5,7 @@ from collections import defaultdict
 from collections.abc import Callable, Iterable
 import dataclasses
 from datetime import datetime, timedelta
-from itertools import groupby
+from itertools import chain, groupby
 import logging
 from typing import TYPE_CHECKING, Any, Literal
 
@@ -629,7 +629,7 @@ def statistics_during_period(
             return {}
         # Return statistics combined with metadata
         return _sorted_statistics_to_dict(
-            hass, stats, statistic_ids, metadata, True, table.duration
+            hass, session, stats, statistic_ids, metadata, True, table, start_time
         )
 
 
@@ -668,25 +668,64 @@ def get_last_statistics(
         # Return statistics combined with metadata
         return _sorted_statistics_to_dict(
             hass,
+            session,
             stats,
             statistic_ids,
             metadata,
             convert_units,
-            StatisticsShortTerm.duration,
+            StatisticsShortTerm,
+            None,
         )
 
 
+def _statistics_at_time(
+    session: scoped_session,
+    metadata_ids: set[int],
+    table: type[Statistics | StatisticsShortTerm],
+    start_time: datetime,
+) -> list | None:
+    """Return last known statistics, earlier than start_time, for the metadata_ids."""
+    # Pick the base statistics query matching the given table
+    if table == StatisticsShortTerm:
+        base_query = QUERY_STATISTICS_SHORT_TERM
+    else:
+        base_query = QUERY_STATISTICS
+
+    query = session.query(*base_query)
+
+    most_recent_statistic_ids = (
+        session.query(
+            func.max(table.id).label("max_id"),
+        )
+        .filter(table.start < start_time)
+        .filter(table.metadata_id.in_(metadata_ids))
+    )
+    most_recent_statistic_ids = most_recent_statistic_ids.group_by(table.metadata_id)
+    most_recent_statistic_ids = most_recent_statistic_ids.subquery()
+    query = query.join(
+        most_recent_statistic_ids,
+        table.id == most_recent_statistic_ids.c.max_id,
+    )
+
+    return execute(query)
+
+
 def _sorted_statistics_to_dict(
     hass: HomeAssistant,
+    session: scoped_session,
     stats: list,
     statistic_ids: list[str] | None,
     _metadata: dict[str, tuple[int, StatisticMetaData]],
     convert_units: bool,
-    duration: timedelta,
+    table: type[Statistics | StatisticsShortTerm],
+    start_time: datetime | None,
 ) -> dict[str, list[dict]]:
     """Convert SQL results into JSON friendly data structure."""
     result: dict = defaultdict(list)
     units = hass.config.units
+    metadata = dict(_metadata.values())
+    need_stat_at_start_time = set()
+    stats_at_start_time = {}
 
     def no_conversion(val: Any, _: Any) -> float | None:
         """Return x."""
@@ -697,7 +736,19 @@ def _sorted_statistics_to_dict(
         for stat_id in statistic_ids:
             result[stat_id] = []
 
-    metadata = dict(_metadata.values())
+    # Identify metadata IDs for which no data was available at the requested start time
+    for meta_id, group in groupby(stats, lambda stat: stat.metadata_id):  # type: ignore
+        first_start_time = process_timestamp(next(group).start)
+        if start_time and first_start_time > start_time:
+            need_stat_at_start_time.add(meta_id)
+
+    # Fetch last known statistics for the needed metadata IDs
+    if need_stat_at_start_time:
+        assert start_time  # Cannot be None if need_stat_at_start_time is not empty
+        tmp = _statistics_at_time(session, need_stat_at_start_time, table, start_time)
+        if tmp:
+            for stat in tmp:
+                stats_at_start_time[stat.metadata_id] = (stat,)
 
     # Append all statistic entries, and optionally do unit conversion
     for meta_id, group in groupby(stats, lambda stat: stat.metadata_id):  # type: ignore
@@ -709,9 +760,9 @@ def _sorted_statistics_to_dict(
         else:
             convert = no_conversion
         ent_results = result[meta_id]
-        for db_state in group:
+        for db_state in chain(stats_at_start_time.get(meta_id, ()), group):
             start = process_timestamp(db_state.start)
-            end = start + duration
+            end = start + table.duration
             ent_results.append(
                 {
                     "statistic_id": statistic_id,
diff --git a/tests/components/sensor/test_recorder.py b/tests/components/sensor/test_recorder.py
index 6fe90a26ded..96335d435da 100644
--- a/tests/components/sensor/test_recorder.py
+++ b/tests/components/sensor/test_recorder.py
@@ -337,7 +337,7 @@ def test_compile_hourly_sum_statistics_amount(
         {"statistic_id": "sensor.test1", "unit_of_measurement": display_unit}
     ]
     stats = statistics_during_period(hass, period0, period="5minute")
-    assert stats == {
+    expected_stats = {
         "sensor.test1": [
             {
                 "statistic_id": "sensor.test1",
@@ -374,6 +374,27 @@ def test_compile_hourly_sum_statistics_amount(
             },
         ]
     }
+    assert stats == expected_stats
+
+    # With an offset of 1 minute, we expect to get all periods
+    stats = statistics_during_period(
+        hass, period0 + timedelta(minutes=1), period="5minute"
+    )
+    assert stats == expected_stats
+
+    # With an offset of 5 minutes, we expect to get the 2nd and 3rd periods
+    stats = statistics_during_period(
+        hass, period0 + timedelta(minutes=5), period="5minute"
+    )
+    expected_stats["sensor.test1"] = expected_stats["sensor.test1"][1:3]
+    assert stats == expected_stats
+
+    # With an offset of 6 minutes, we expect to get the 2nd and 3rd periods
+    stats = statistics_during_period(
+        hass, period0 + timedelta(minutes=6), period="5minute"
+    )
+    assert stats == expected_stats
+
     assert "Error while processing event StatisticsTask" not in caplog.text
     assert "Detected new cycle for sensor.test1, last_reset set to" in caplog.text
     assert "Compiling initial sum statistics for sensor.test1" in caplog.text
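
The change hinges on the "latest row per group" query built in the new
_statistics_at_time() helper: a subquery selects max(id) per metadata_id among
rows starting before the requested start_time and is joined back to the
statistics table to fetch those rows. The snippet below is a minimal,
self-contained sketch of that pattern, not part of the commit: the Stat model,
its columns, and the seeded values are simplified stand-ins for the recorder
schema, and it assumes (as the recorder does) that row ids grow monotonically
with insertion order, so max(id) picks the most recent row per series.

from datetime import datetime

from sqlalchemy import Column, DateTime, Float, Integer, create_engine, func
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Stat(Base):
    """Toy statistics row: one value per metadata_id per period start."""

    __tablename__ = "stats"
    id = Column(Integer, primary_key=True)
    metadata_id = Column(Integer, index=True)
    start = Column(DateTime)
    sum = Column(Float)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# Seed two series with rows before and after the requested start time.
start_time = datetime(2021, 10, 12, 12, 0)
session.add_all(
    [
        Stat(metadata_id=1, start=datetime(2021, 10, 12, 11, 0), sum=10.0),
        Stat(metadata_id=1, start=datetime(2021, 10, 12, 11, 30), sum=12.0),
        Stat(metadata_id=1, start=datetime(2021, 10, 12, 12, 30), sum=15.0),
        Stat(metadata_id=2, start=datetime(2021, 10, 12, 11, 45), sum=3.0),
    ]
)
session.commit()

# Subquery: highest row id per metadata_id among rows that start before start_time.
most_recent_ids = (
    session.query(func.max(Stat.id).label("max_id"))
    .filter(Stat.start < start_time)
    .filter(Stat.metadata_id.in_({1, 2}))
    .group_by(Stat.metadata_id)
    .subquery()
)

# Join back to the table to fetch the full rows: the "state at start_time".
rows = session.query(Stat).join(most_recent_ids, Stat.id == most_recent_ids.c.max_id)
for row in rows:
    print(row.metadata_id, row.start, row.sum)  # metadata 1 -> 11:30, metadata 2 -> 11:45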