Remove duplicated statistics rows (#61146)

* Remove duplicated statistics

* Fix misleading docstring

* Pylint knows best

* Correct test

* Oops

* Prevent insertion of duplicated statistics

* Tweak

* pylint

* Add models_schema_23.py

* Tweak
Erik Montnemery 2021-12-13 14:15:36 +01:00 committed by GitHub
parent 3635946211
commit bceeaec2f8
6 changed files with 1429 additions and 14 deletions
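For orientation: two statistics rows are considered duplicates when they share the same metadata_id and start. Below is a minimal, self-contained sketch of the detection idea used in the diff, with a trimmed-down stand-in table (names and layout are assumptions, not the actual recorder models):

from datetime import datetime

from sqlalchemy import (
    Column,
    DateTime,
    Integer,
    MetaData,
    Table,
    create_engine,
    func,
    select,
)

# Trimmed-down stand-in for the statistics table: only the two columns that
# define row identity are kept.
metadata_obj = MetaData()
statistics_table = Table(
    "statistics",
    metadata_obj,
    Column("id", Integer, primary_key=True),
    Column("metadata_id", Integer),
    Column("start", DateTime),
)

engine = create_engine("sqlite://")
metadata_obj.create_all(engine)

start = datetime(2021, 10, 31, 23, 0)
with engine.begin() as conn:
    conn.execute(
        statistics_table.insert(),
        [
            {"metadata_id": 1, "start": start},
            {"metadata_id": 1, "start": start},  # duplicate of the row above
            {"metadata_id": 2, "start": start},
        ],
    )

# Groups that occur more than once are duplicates; this mirrors the
# GROUP BY / HAVING subquery used by _find_duplicates in the diff below.
duplicated_groups = (
    select(statistics_table.c.metadata_id, statistics_table.c.start)
    .group_by(statistics_table.c.metadata_id, statistics_table.c.start)
    .having(func.count() > 1)
)

with engine.connect() as conn:
    print(conn.execute(duplicated_groups).all())  # one duplicated group: metadata_id=1 at `start`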


@@ -25,7 +25,7 @@ from .models import (
StatisticsShortTerm,
process_timestamp,
)
from .statistics import get_start_time
from .statistics import delete_duplicates, get_start_time
from .util import session_scope
_LOGGER = logging.getLogger(__name__)
@@ -587,6 +587,22 @@ def _apply_update(instance, session, new_version, old_version): # noqa: C901
elif new_version == 23:
# Add name column to StatisticsMeta
_add_columns(session, "statistics_meta", ["name VARCHAR(255)"])
elif new_version == 24:
# Delete duplicated statistics
delete_duplicates(instance, session)
# Recreate statistics indices to block duplicated statistics
_drop_index(connection, "statistics", "ix_statistics_statistic_id_start")
_create_index(connection, "statistics", "ix_statistics_statistic_id_start")
_drop_index(
connection,
"statistics_short_term",
"ix_statistics_short_term_statistic_id_start",
)
_create_index(
connection,
"statistics_short_term",
"ix_statistics_short_term_statistic_id_start",
)
else:
raise ValueError(f"No schema migration defined for version {new_version}")


@@ -40,7 +40,7 @@ import homeassistant.util.dt as dt_util
# pylint: disable=invalid-name
Base = declarative_base()
SCHEMA_VERSION = 23
SCHEMA_VERSION = 24
_LOGGER = logging.getLogger(__name__)
@@ -289,7 +289,7 @@ class Statistics(Base, StatisticsBase): # type: ignore
__table_args__ = (
# Used for fetching statistics for a certain entity at a specific time
Index("ix_statistics_statistic_id_start", "metadata_id", "start"),
Index("ix_statistics_statistic_id_start", "metadata_id", "start", unique=True),
)
__tablename__ = TABLE_STATISTICS
@@ -301,7 +301,12 @@ class StatisticsShortTerm(Base, StatisticsBase): # type: ignore
__table_args__ = (
# Used for fetching statistics for a certain entity at a specific time
Index("ix_statistics_short_term_statistic_id_start", "metadata_id", "start"),
Index(
"ix_statistics_short_term_statistic_id_start",
"metadata_id",
"start",
unique=True,
),
)
__tablename__ = TABLE_STATISTICS_SHORT_TERM
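With unique=True the index above becomes a UNIQUE index, so the database itself rejects a second row with the same (metadata_id, start). A rough, self-contained sketch of that effect on an in-memory SQLite database (simplified stand-in table, not the real models):

from datetime import datetime

from sqlalchemy import (
    Column,
    DateTime,
    Index,
    Integer,
    MetaData,
    Table,
    create_engine,
)
from sqlalchemy.exc import IntegrityError

metadata_obj = MetaData()
statistics_table = Table(
    "statistics",
    metadata_obj,
    Column("id", Integer, primary_key=True),
    Column("metadata_id", Integer),
    Column("start", DateTime),
    # Same shape as the index in the diff: unique on (metadata_id, start).
    Index("ix_statistics_statistic_id_start", "metadata_id", "start", unique=True),
)

engine = create_engine("sqlite://")
metadata_obj.create_all(engine)  # emits CREATE UNIQUE INDEX ... (metadata_id, start)

start = datetime(2021, 10, 31, 23, 0)
with engine.begin() as conn:
    conn.execute(statistics_table.insert(), {"metadata_id": 1, "start": start})

try:
    with engine.begin() as conn:
        conn.execute(statistics_table.insert(), {"metadata_id": 1, "start": start})
except IntegrityError as err:
    # On SQLite the error message contains "UNIQUE constraint failed", which is
    # the string the unique-constraint filter added in statistics.py checks for.
    print(err)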


@@ -3,19 +3,21 @@ from __future__ import annotations
from collections import defaultdict
from collections.abc import Callable, Iterable
import contextlib
import dataclasses
from datetime import datetime, timedelta
from itertools import chain, groupby
import json
import logging
import re
from statistics import mean
from typing import TYPE_CHECKING, Any, Literal
from sqlalchemy import bindparam, func
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.exc import SQLAlchemyError, StatementError
from sqlalchemy.ext import baked
from sqlalchemy.orm.scoping import scoped_session
from sqlalchemy.sql.expression import true
from sqlalchemy.sql.expression import literal_column, true
from homeassistant.const import (
PRESSURE_PA,
@@ -26,13 +28,14 @@ from homeassistant.const import (
from homeassistant.core import Event, HomeAssistant, callback
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers import entity_registry
from homeassistant.helpers.json import JSONEncoder
import homeassistant.util.dt as dt_util
import homeassistant.util.pressure as pressure_util
import homeassistant.util.temperature as temperature_util
from homeassistant.util.unit_system import UnitSystem
import homeassistant.util.volume as volume_util
from .const import DATA_INSTANCE, DOMAIN
from .const import DATA_INSTANCE, DOMAIN, MAX_ROWS_TO_PURGE
from .models import (
StatisticData,
StatisticMetaData,
@@ -114,6 +117,8 @@ QUERY_STATISTIC_META_ID = [
StatisticsMeta.statistic_id,
]
MAX_DUPLICATES = 1000000
STATISTICS_BAKERY = "recorder_statistics_bakery"
STATISTICS_META_BAKERY = "recorder_statistics_meta_bakery"
STATISTICS_SHORT_TERM_BAKERY = "recorder_statistics_short_term_bakery"
@@ -262,6 +267,139 @@ def _update_or_add_metadata(
return metadata_id
def _find_duplicates(
session: scoped_session, table: type[Statistics | StatisticsShortTerm]
) -> tuple[list[int], list[dict]]:
"""Find duplicated statistics."""
subquery = (
session.query(
table.start,
table.metadata_id,
literal_column("1").label("is_duplicate"),
)
.group_by(table.metadata_id, table.start)
.having(func.count() > 1)
.subquery()
)
query = (
session.query(table)
.outerjoin(
subquery,
(subquery.c.metadata_id == table.metadata_id)
& (subquery.c.start == table.start),
)
.filter(subquery.c.is_duplicate == 1)
.order_by(table.metadata_id, table.start, table.id.desc())
.limit(MAX_ROWS_TO_PURGE)
)
duplicates = execute(query)
original_as_dict = {}
start = None
metadata_id = None
duplicate_ids: list[int] = []
non_identical_duplicates_as_dict: list[dict] = []
if not duplicates:
return (duplicate_ids, non_identical_duplicates_as_dict)
def columns_to_dict(duplicate: type[Statistics | StatisticsShortTerm]) -> dict:
"""Convert a SQLAlchemy row to dict."""
dict_ = {}
for key in duplicate.__mapper__.c.keys():
dict_[key] = getattr(duplicate, key)
return dict_
def compare_statistic_rows(row1: dict, row2: dict) -> bool:
"""Compare two statistics rows, ignoring id and created."""
ignore_keys = ["id", "created"]
keys1 = set(row1).difference(ignore_keys)
keys2 = set(row2).difference(ignore_keys)
return keys1 == keys2 and all(row1[k] == row2[k] for k in keys1)
for duplicate in duplicates:
if start != duplicate.start or metadata_id != duplicate.metadata_id:
original_as_dict = columns_to_dict(duplicate)
start = duplicate.start
metadata_id = duplicate.metadata_id
continue
duplicate_as_dict = columns_to_dict(duplicate)
duplicate_ids.append(duplicate.id)
if not compare_statistic_rows(original_as_dict, duplicate_as_dict):
non_identical_duplicates_as_dict.append(duplicate_as_dict)
return (duplicate_ids, non_identical_duplicates_as_dict)
def _delete_duplicates_from_table(
session: scoped_session, table: type[Statistics | StatisticsShortTerm]
) -> tuple[int, list[dict]]:
"""Identify and delete duplicated statistics from a specified table."""
all_non_identical_duplicates: list[dict] = []
total_deleted_rows = 0
while True:
duplicate_ids, non_identical_duplicates = _find_duplicates(session, table)
if not duplicate_ids:
break
all_non_identical_duplicates.extend(non_identical_duplicates)
deleted_rows = (
session.query(table)
.filter(table.id.in_(duplicate_ids))
.delete(synchronize_session=False)
)
total_deleted_rows += deleted_rows
if total_deleted_rows >= MAX_DUPLICATES:
break
return (total_deleted_rows, all_non_identical_duplicates)
def delete_duplicates(instance: Recorder, session: scoped_session) -> None:
"""Identify and delete duplicated statistics.
A backup will be made of the non-identical duplicates before they are deleted.
"""
deleted_statistics_rows, non_identical_duplicates = _delete_duplicates_from_table(
session, Statistics
)
if deleted_statistics_rows:
_LOGGER.info("Deleted %s duplicated statistics rows", deleted_statistics_rows)
if non_identical_duplicates:
isotime = dt_util.utcnow().isoformat()
backup_file_name = f"deleted_statistics.{isotime}.json"
backup_path = instance.hass.config.path(backup_file_name)
with open(backup_path, "w", encoding="utf8") as backup_file:
json.dump(
non_identical_duplicates,
backup_file,
indent=4,
sort_keys=True,
cls=JSONEncoder,
)
_LOGGER.warning(
"Deleted %s non identical duplicated %s rows, a backup of the deleted rows "
"has been saved to %s",
len(non_identical_duplicates),
Statistics.__tablename__,
backup_path,
)
if deleted_statistics_rows >= MAX_DUPLICATES:
_LOGGER.warning(
"Found more than %s duplicated statistic rows, please report at "
'https://github.com/home-assistant/core/issues?q=is%%3Aissue+label%%3A"integration%%3A+recorder"+',
MAX_DUPLICATES - 1,
)
deleted_short_term_statistics_rows, _ = _delete_duplicates_from_table(
session, StatisticsShortTerm
)
if deleted_short_term_statistics_rows:
_LOGGER.warning(
"Deleted duplicated short term statistic rows, please report at "
'https://github.com/home-assistant/core/issues?q=is%%3Aissue+label%%3A"integration%%3A+recorder"+'
)
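The backup step above writes the dropped rows to a timestamped JSON file using Home Assistant's JSONEncoder. A generic sketch of the same pattern, illustrative only and not part of the diff, with json's default=str standing in for that encoder and made-up row values:

import json
from datetime import datetime, timezone

# Rows as plain dicts, the way _find_duplicates returns them (values illustrative).
non_identical_duplicates = [
    {
        "id": 4,
        "metadata_id": 1,
        "start": datetime(2021, 10, 31, 23, 0, tzinfo=timezone.utc),
        "state": 3.0,
        "sum": 5.0,
    }
]

isotime = datetime.now(timezone.utc).isoformat()
backup_path = f"deleted_statistics.{isotime}.json"
with open(backup_path, "w", encoding="utf8") as backup_file:
    # default=str stands in for Home Assistant's JSONEncoder, which knows how
    # to serialize datetime objects.
    json.dump(
        non_identical_duplicates,
        backup_file,
        indent=4,
        sort_keys=True,
        default=str,
    )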
def compile_hourly_statistics(
instance: Recorder, session: scoped_session, start: datetime
) -> None:
@@ -411,7 +549,10 @@ def compile_statistics(instance: Recorder, start: datetime) -> bool:
platform_stats.extend(platform_stat)
# Insert collected statistics in the database
with session_scope(session=instance.get_session()) as session: # type: ignore
with session_scope(
session=instance.get_session(), # type: ignore
exception_filter=_filter_unique_constraint_integrity_error(instance),
) as session:
for stats in platform_stats:
metadata_id = _update_or_add_metadata(instance.hass, session, stats["meta"])
_insert_statistics(
@@ -1066,6 +1207,43 @@ def async_add_external_statistics(
hass.data[DATA_INSTANCE].async_external_statistics(metadata, statistics)
def _filter_unique_constraint_integrity_error(
instance: Recorder,
) -> Callable[[Exception], bool]:
def _filter_unique_constraint_integrity_error(err: Exception) -> bool:
"""Handle unique constraint integrity errors."""
if not isinstance(err, StatementError):
return False
ignore = False
if (
instance.engine.dialect.name == "sqlite"
and "UNIQUE constraint failed" in str(err)
):
ignore = True
if (
instance.engine.dialect.name == "postgresql"
and hasattr(err.orig, "pgcode")
and err.orig.pgcode == "23505"
):
ignore = True
if instance.engine.dialect.name == "mysql" and hasattr(err.orig, "args"):
with contextlib.suppress(TypeError):
if err.orig.args[0] == 1062:
ignore = True
if ignore:
_LOGGER.warning(
"Blocked attempt to insert duplicated statistic rows, please report at "
'https://github.com/home-assistant/core/issues?q=is%%3Aissue+label%%3A"integration%%3A+recorder"+',
exc_info=err,
)
return ignore
return _filter_unique_constraint_integrity_error
@retryable_database_job("statistics")
def add_external_statistics(
instance: Recorder,
@@ -1073,7 +1251,11 @@ def add_external_statistics(
statistics: Iterable[StatisticData],
) -> bool:
"""Process an add_statistics job."""
with session_scope(session=instance.get_session()) as session: # type: ignore
with session_scope(
session=instance.get_session(), # type: ignore
exception_filter=_filter_unique_constraint_integrity_error(instance),
) as session:
metadata_id = _update_or_add_metadata(instance.hass, session, metadata)
for stat in statistics:
if stat_id := _statistics_exists(


@@ -66,7 +66,10 @@ RETRYABLE_MYSQL_ERRORS = (1205, 1206, 1213)
@contextmanager
def session_scope(
*, hass: HomeAssistant | None = None, session: Session | None = None
*,
hass: HomeAssistant | None = None,
session: Session | None = None,
exception_filter: Callable[[Exception], bool] | None = None,
) -> Generator[Session, None, None]:
"""Provide a transactional scope around a series of operations."""
if session is None and hass is not None:
@@ -81,11 +84,12 @@ def session_scope(
if session.get_transaction():
need_rollback = True
session.commit()
except Exception as err:
except Exception as err: # pylint: disable=broad-except
_LOGGER.error("Error executing query: %s", err)
if need_rollback:
session.rollback()
raise
if not exception_filter or not exception_filter(err):
raise
finally:
session.close()
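A compact, self-contained sketch of how the new exception_filter parameter behaves (FakeSession and session_scope_sketch are stand-ins for this sketch, not the real helpers): the filter inspects the commit-time error after rollback and decides whether it is swallowed or re-raised.

from __future__ import annotations

from collections.abc import Callable, Generator
from contextlib import contextmanager


class FakeSession:
    """Stand-in for a SQLAlchemy Session, used only for this sketch."""

    def commit(self) -> None:
        raise RuntimeError("UNIQUE constraint failed: statistics.metadata_id, statistics.start")

    def rollback(self) -> None:
        print("rolled back")

    def close(self) -> None:
        print("closed")


@contextmanager
def session_scope_sketch(
    session: FakeSession,
    exception_filter: Callable[[Exception], bool] | None = None,
) -> Generator[FakeSession, None, None]:
    """Simplified version of session_scope: commit, and on error ask the filter."""
    try:
        yield session
        session.commit()
    except Exception as err:  # pylint: disable=broad-except
        session.rollback()
        if not exception_filter or not exception_filter(err):
            raise
    finally:
        session.close()


# Unique-constraint errors are swallowed; anything else would be re-raised.
with session_scope_sketch(
    FakeSession(), exception_filter=lambda err: "UNIQUE constraint" in str(err)
):
    pass  # inserts would go here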


@@ -0,0 +1,582 @@
"""Models for SQLAlchemy.
This file contains the model definitions for schema version 23,
used by Home Assistant Core 2021.11.0, which adds the name column
to statistics_meta.
It is used to test the schema migration logic.
"""
from __future__ import annotations
from datetime import datetime, timedelta
import json
import logging
from typing import TypedDict, overload
from sqlalchemy import (
Boolean,
Column,
DateTime,
Float,
ForeignKey,
Identity,
Index,
Integer,
String,
Text,
distinct,
)
from sqlalchemy.dialects import mysql, oracle, postgresql
from sqlalchemy.ext.declarative import declared_attr
from sqlalchemy.orm import declarative_base, relationship
from sqlalchemy.orm.session import Session
from homeassistant.const import (
MAX_LENGTH_EVENT_CONTEXT_ID,
MAX_LENGTH_EVENT_EVENT_TYPE,
MAX_LENGTH_EVENT_ORIGIN,
MAX_LENGTH_STATE_DOMAIN,
MAX_LENGTH_STATE_ENTITY_ID,
MAX_LENGTH_STATE_STATE,
)
from homeassistant.core import Context, Event, EventOrigin, State, split_entity_id
from homeassistant.helpers.json import JSONEncoder
import homeassistant.util.dt as dt_util
# SQLAlchemy Schema
# pylint: disable=invalid-name
Base = declarative_base()
SCHEMA_VERSION = 23
_LOGGER = logging.getLogger(__name__)
DB_TIMEZONE = "+00:00"
TABLE_EVENTS = "events"
TABLE_STATES = "states"
TABLE_RECORDER_RUNS = "recorder_runs"
TABLE_SCHEMA_CHANGES = "schema_changes"
TABLE_STATISTICS = "statistics"
TABLE_STATISTICS_META = "statistics_meta"
TABLE_STATISTICS_RUNS = "statistics_runs"
TABLE_STATISTICS_SHORT_TERM = "statistics_short_term"
ALL_TABLES = [
TABLE_STATES,
TABLE_EVENTS,
TABLE_RECORDER_RUNS,
TABLE_SCHEMA_CHANGES,
TABLE_STATISTICS,
TABLE_STATISTICS_META,
TABLE_STATISTICS_RUNS,
TABLE_STATISTICS_SHORT_TERM,
]
DATETIME_TYPE = DateTime(timezone=True).with_variant(
mysql.DATETIME(timezone=True, fsp=6), "mysql"
)
DOUBLE_TYPE = (
Float()
.with_variant(mysql.DOUBLE(asdecimal=False), "mysql")
.with_variant(oracle.DOUBLE_PRECISION(), "oracle")
.with_variant(postgresql.DOUBLE_PRECISION(), "postgresql")
)
class Events(Base): # type: ignore
"""Event history data."""
__table_args__ = (
# Used for fetching events at a specific time
# see logbook
Index("ix_events_event_type_time_fired", "event_type", "time_fired"),
{"mysql_default_charset": "utf8mb4", "mysql_collate": "utf8mb4_unicode_ci"},
)
__tablename__ = TABLE_EVENTS
event_id = Column(Integer, Identity(), primary_key=True)
event_type = Column(String(MAX_LENGTH_EVENT_EVENT_TYPE))
event_data = Column(Text().with_variant(mysql.LONGTEXT, "mysql"))
origin = Column(String(MAX_LENGTH_EVENT_ORIGIN))
time_fired = Column(DATETIME_TYPE, index=True)
created = Column(DATETIME_TYPE, default=dt_util.utcnow)
context_id = Column(String(MAX_LENGTH_EVENT_CONTEXT_ID), index=True)
context_user_id = Column(String(MAX_LENGTH_EVENT_CONTEXT_ID), index=True)
context_parent_id = Column(String(MAX_LENGTH_EVENT_CONTEXT_ID), index=True)
def __repr__(self) -> str:
"""Return string representation of instance for debugging."""
return (
f"<recorder.Events("
f"id={self.event_id}, type='{self.event_type}', data='{self.event_data}', "
f"origin='{self.origin}', time_fired='{self.time_fired}'"
f")>"
)
@staticmethod
def from_event(event, event_data=None):
"""Create an event database object from a native event."""
return Events(
event_type=event.event_type,
event_data=event_data
or json.dumps(event.data, cls=JSONEncoder, separators=(",", ":")),
origin=str(event.origin.value),
time_fired=event.time_fired,
context_id=event.context.id,
context_user_id=event.context.user_id,
context_parent_id=event.context.parent_id,
)
def to_native(self, validate_entity_id=True):
"""Convert to a native HA Event."""
context = Context(
id=self.context_id,
user_id=self.context_user_id,
parent_id=self.context_parent_id,
)
try:
return Event(
self.event_type,
json.loads(self.event_data),
EventOrigin(self.origin),
process_timestamp(self.time_fired),
context=context,
)
except ValueError:
# When json.loads fails
_LOGGER.exception("Error converting to event: %s", self)
return None
class States(Base): # type: ignore
"""State change history."""
__table_args__ = (
# Used for fetching the state of entities at a specific time
# (get_states in history.py)
Index("ix_states_entity_id_last_updated", "entity_id", "last_updated"),
{"mysql_default_charset": "utf8mb4", "mysql_collate": "utf8mb4_unicode_ci"},
)
__tablename__ = TABLE_STATES
state_id = Column(Integer, Identity(), primary_key=True)
domain = Column(String(MAX_LENGTH_STATE_DOMAIN))
entity_id = Column(String(MAX_LENGTH_STATE_ENTITY_ID))
state = Column(String(MAX_LENGTH_STATE_STATE))
attributes = Column(Text().with_variant(mysql.LONGTEXT, "mysql"))
event_id = Column(
Integer, ForeignKey("events.event_id", ondelete="CASCADE"), index=True
)
last_changed = Column(DATETIME_TYPE, default=dt_util.utcnow)
last_updated = Column(DATETIME_TYPE, default=dt_util.utcnow, index=True)
created = Column(DATETIME_TYPE, default=dt_util.utcnow)
old_state_id = Column(Integer, ForeignKey("states.state_id"), index=True)
event = relationship("Events", uselist=False)
old_state = relationship("States", remote_side=[state_id])
def __repr__(self) -> str:
"""Return string representation of instance for debugging."""
return (
f"<recorder.States("
f"id={self.state_id}, domain='{self.domain}', entity_id='{self.entity_id}', "
f"state='{self.state}', event_id='{self.event_id}', "
f"last_updated='{self.last_updated.isoformat(sep=' ', timespec='seconds')}', "
f"old_state_id={self.old_state_id}"
f")>"
)
@staticmethod
def from_event(event):
"""Create object from a state_changed event."""
entity_id = event.data["entity_id"]
state = event.data.get("new_state")
dbstate = States(entity_id=entity_id)
# State got deleted
if state is None:
dbstate.state = ""
dbstate.domain = split_entity_id(entity_id)[0]
dbstate.attributes = "{}"
dbstate.last_changed = event.time_fired
dbstate.last_updated = event.time_fired
else:
dbstate.domain = state.domain
dbstate.state = state.state
dbstate.attributes = json.dumps(
dict(state.attributes), cls=JSONEncoder, separators=(",", ":")
)
dbstate.last_changed = state.last_changed
dbstate.last_updated = state.last_updated
return dbstate
def to_native(self, validate_entity_id=True):
"""Convert to an HA state object."""
try:
return State(
self.entity_id,
self.state,
json.loads(self.attributes),
process_timestamp(self.last_changed),
process_timestamp(self.last_updated),
# Join the events table on event_id to get the context instead
# as it will always be there for state_changed events
context=Context(id=None),
validate_entity_id=validate_entity_id,
)
except ValueError:
# When json.loads fails
_LOGGER.exception("Error converting row to state: %s", self)
return None
class StatisticResult(TypedDict):
"""Statistic result data class.
Allows multiple datapoints for the same statistic_id.
"""
meta: StatisticMetaData
stat: StatisticData
class StatisticDataBase(TypedDict):
"""Mandatory fields for statistic data class."""
start: datetime
class StatisticData(StatisticDataBase, total=False):
"""Statistic data class."""
mean: float
min: float
max: float
last_reset: datetime | None
state: float
sum: float
class StatisticsBase:
"""Statistics base class."""
id = Column(Integer, Identity(), primary_key=True)
created = Column(DATETIME_TYPE, default=dt_util.utcnow)
@declared_attr
def metadata_id(self):
"""Define the metadata_id column for sub classes."""
return Column(
Integer,
ForeignKey(f"{TABLE_STATISTICS_META}.id", ondelete="CASCADE"),
index=True,
)
start = Column(DATETIME_TYPE, index=True)
mean = Column(DOUBLE_TYPE)
min = Column(DOUBLE_TYPE)
max = Column(DOUBLE_TYPE)
last_reset = Column(DATETIME_TYPE)
state = Column(DOUBLE_TYPE)
sum = Column(DOUBLE_TYPE)
@classmethod
def from_stats(cls, metadata_id: int, stats: StatisticData):
"""Create object from a statistics."""
return cls( # type: ignore
metadata_id=metadata_id,
**stats,
)
class Statistics(Base, StatisticsBase): # type: ignore
"""Long term statistics."""
duration = timedelta(hours=1)
__table_args__ = (
# Used for fetching statistics for a certain entity at a specific time
Index("ix_statistics_statistic_id_start", "metadata_id", "start"),
)
__tablename__ = TABLE_STATISTICS
class StatisticsShortTerm(Base, StatisticsBase): # type: ignore
"""Short term statistics."""
duration = timedelta(minutes=5)
__table_args__ = (
# Used for fetching statistics for a certain entity at a specific time
Index("ix_statistics_short_term_statistic_id_start", "metadata_id", "start"),
)
__tablename__ = TABLE_STATISTICS_SHORT_TERM
class StatisticMetaData(TypedDict):
"""Statistic meta data class."""
has_mean: bool
has_sum: bool
name: str | None
source: str
statistic_id: str
unit_of_measurement: str | None
class StatisticsMeta(Base): # type: ignore
"""Statistics meta data."""
__table_args__ = (
{"mysql_default_charset": "utf8mb4", "mysql_collate": "utf8mb4_unicode_ci"},
)
__tablename__ = TABLE_STATISTICS_META
id = Column(Integer, Identity(), primary_key=True)
statistic_id = Column(String(255), index=True)
source = Column(String(32))
unit_of_measurement = Column(String(255))
has_mean = Column(Boolean)
has_sum = Column(Boolean)
name = Column(String(255))
@staticmethod
def from_meta(meta: StatisticMetaData) -> StatisticsMeta:
"""Create object from meta data."""
return StatisticsMeta(**meta)
class RecorderRuns(Base): # type: ignore
"""Representation of recorder run."""
__table_args__ = (Index("ix_recorder_runs_start_end", "start", "end"),)
__tablename__ = TABLE_RECORDER_RUNS
run_id = Column(Integer, Identity(), primary_key=True)
start = Column(DateTime(timezone=True), default=dt_util.utcnow)
end = Column(DateTime(timezone=True))
closed_incorrect = Column(Boolean, default=False)
created = Column(DateTime(timezone=True), default=dt_util.utcnow)
def __repr__(self) -> str:
"""Return string representation of instance for debugging."""
end = (
f"'{self.end.isoformat(sep=' ', timespec='seconds')}'" if self.end else None
)
return (
f"<recorder.RecorderRuns("
f"id={self.run_id}, start='{self.start.isoformat(sep=' ', timespec='seconds')}', "
f"end={end}, closed_incorrect={self.closed_incorrect}, "
f"created='{self.created.isoformat(sep=' ', timespec='seconds')}'"
f")>"
)
def entity_ids(self, point_in_time=None):
"""Return the entity ids that existed in this run.
Specify point_in_time if you want to know which existed at that point
in time inside the run.
"""
session = Session.object_session(self)
assert session is not None, "RecorderRuns need to be persisted"
query = session.query(distinct(States.entity_id)).filter(
States.last_updated >= self.start
)
if point_in_time is not None:
query = query.filter(States.last_updated < point_in_time)
elif self.end is not None:
query = query.filter(States.last_updated < self.end)
return [row[0] for row in query]
def to_native(self, validate_entity_id=True):
"""Return self, native format is this model."""
return self
class SchemaChanges(Base): # type: ignore
"""Representation of schema version changes."""
__tablename__ = TABLE_SCHEMA_CHANGES
change_id = Column(Integer, Identity(), primary_key=True)
schema_version = Column(Integer)
changed = Column(DateTime(timezone=True), default=dt_util.utcnow)
def __repr__(self) -> str:
"""Return string representation of instance for debugging."""
return (
f"<recorder.SchemaChanges("
f"id={self.change_id}, schema_version={self.schema_version}, "
f"changed='{self.changed.isoformat(sep=' ', timespec='seconds')}'"
f")>"
)
class StatisticsRuns(Base): # type: ignore
"""Representation of statistics run."""
__tablename__ = TABLE_STATISTICS_RUNS
run_id = Column(Integer, Identity(), primary_key=True)
start = Column(DateTime(timezone=True))
def __repr__(self) -> str:
"""Return string representation of instance for debugging."""
return (
f"<recorder.StatisticsRuns("
f"id={self.run_id}, start='{self.start.isoformat(sep=' ', timespec='seconds')}', "
f")>"
)
@overload
def process_timestamp(ts: None) -> None:
...
@overload
def process_timestamp(ts: datetime) -> datetime:
...
def process_timestamp(ts: datetime | None) -> datetime | None:
"""Process a timestamp into datetime object."""
if ts is None:
return None
if ts.tzinfo is None:
return ts.replace(tzinfo=dt_util.UTC)
return dt_util.as_utc(ts)
@overload
def process_timestamp_to_utc_isoformat(ts: None) -> None:
...
@overload
def process_timestamp_to_utc_isoformat(ts: datetime) -> str:
...
def process_timestamp_to_utc_isoformat(ts: datetime | None) -> str | None:
"""Process a timestamp into UTC isotime."""
if ts is None:
return None
if ts.tzinfo == dt_util.UTC:
return ts.isoformat()
if ts.tzinfo is None:
return f"{ts.isoformat()}{DB_TIMEZONE}"
return ts.astimezone(dt_util.UTC).isoformat()
class LazyState(State):
"""A lazy version of core State."""
__slots__ = [
"_row",
"entity_id",
"state",
"_attributes",
"_last_changed",
"_last_updated",
"_context",
]
def __init__(self, row): # pylint: disable=super-init-not-called
"""Init the lazy state."""
self._row = row
self.entity_id = self._row.entity_id
self.state = self._row.state or ""
self._attributes = None
self._last_changed = None
self._last_updated = None
self._context = None
@property # type: ignore
def attributes(self):
"""State attributes."""
if not self._attributes:
try:
self._attributes = json.loads(self._row.attributes)
except ValueError:
# When json.loads fails
_LOGGER.exception("Error converting row to state: %s", self._row)
self._attributes = {}
return self._attributes
@attributes.setter
def attributes(self, value):
"""Set attributes."""
self._attributes = value
@property # type: ignore
def context(self):
"""State context."""
if not self._context:
self._context = Context(id=None)
return self._context
@context.setter
def context(self, value):
"""Set context."""
self._context = value
@property # type: ignore
def last_changed(self):
"""Last changed datetime."""
if not self._last_changed:
self._last_changed = process_timestamp(self._row.last_changed)
return self._last_changed
@last_changed.setter
def last_changed(self, value):
"""Set last changed datetime."""
self._last_changed = value
@property # type: ignore
def last_updated(self):
"""Last updated datetime."""
if not self._last_updated:
self._last_updated = process_timestamp(self._row.last_updated)
return self._last_updated
@last_updated.setter
def last_updated(self, value):
"""Set last updated datetime."""
self._last_updated = value
def as_dict(self):
"""Return a dict representation of the LazyState.
Async friendly.
To be used for JSON serialization.
"""
if self._last_changed:
last_changed_isoformat = self._last_changed.isoformat()
else:
last_changed_isoformat = process_timestamp_to_utc_isoformat(
self._row.last_changed
)
if self._last_updated:
last_updated_isoformat = self._last_updated.isoformat()
else:
last_updated_isoformat = process_timestamp_to_utc_isoformat(
self._row.last_updated
)
return {
"entity_id": self.entity_id,
"state": self.state,
"attributes": self._attributes or self.attributes,
"last_changed": last_changed_isoformat,
"last_updated": last_updated_isoformat,
}
def __eq__(self, other):
"""Return the comparison."""
return (
other.__class__ in [self.__class__, State]
and self.entity_id == other.entity_id
and self.state == other.state
and self.attributes == other.attributes
)


@@ -1,12 +1,18 @@
"""The tests for sensor recorder platform."""
# pylint: disable=protected-access,invalid-name
from datetime import timedelta
import importlib
import json
import sys
from unittest.mock import patch, sentinel
import pytest
from pytest import approx
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from homeassistant.components.recorder import history
from homeassistant.components import recorder
from homeassistant.components.recorder import SQLITE_URL_PREFIX, history, statistics
from homeassistant.components.recorder.const import DATA_INSTANCE
from homeassistant.components.recorder.models import (
StatisticsShortTerm,
@@ -14,18 +20,20 @@ from homeassistant.components.recorder.models import (
)
from homeassistant.components.recorder.statistics import (
async_add_external_statistics,
delete_duplicates,
get_last_short_term_statistics,
get_last_statistics,
get_metadata,
list_statistic_ids,
statistics_during_period,
)
from homeassistant.components.recorder.util import session_scope
from homeassistant.const import TEMP_CELSIUS
from homeassistant.exceptions import HomeAssistantError
from homeassistant.setup import setup_component
import homeassistant.util.dt as dt_util
from tests.common import mock_registry
from tests.common import get_test_home_assistant, mock_registry
from tests.components.recorder.common import wait_recording_done
@@ -650,6 +658,624 @@ def test_monthly_statistics(hass_recorder, caplog, timezone):
dt_util.set_default_time_zone(dt_util.get_time_zone("UTC"))
def _create_engine_test(*args, **kwargs):
"""Test version of create_engine that initializes with old schema.
This simulates an existing db with the old schema.
"""
module = "tests.components.recorder.models_schema_23"
importlib.import_module(module)
old_models = sys.modules[module]
engine = create_engine(*args, **kwargs)
old_models.Base.metadata.create_all(engine)
with Session(engine) as session:
session.add(recorder.models.StatisticsRuns(start=statistics.get_start_time()))
session.add(
recorder.models.SchemaChanges(schema_version=old_models.SCHEMA_VERSION)
)
session.commit()
return engine
def test_delete_duplicates(caplog, tmpdir):
"""Test removal of duplicated statistics."""
test_db_file = tmpdir.mkdir("sqlite").join("test_run_info.db")
dburl = f"{SQLITE_URL_PREFIX}//{test_db_file}"
module = "tests.components.recorder.models_schema_23"
importlib.import_module(module)
old_models = sys.modules[module]
period1 = dt_util.as_utc(dt_util.parse_datetime("2021-09-01 00:00:00"))
period2 = dt_util.as_utc(dt_util.parse_datetime("2021-09-30 23:00:00"))
period3 = dt_util.as_utc(dt_util.parse_datetime("2021-10-01 00:00:00"))
period4 = dt_util.as_utc(dt_util.parse_datetime("2021-10-31 23:00:00"))
external_energy_statistics_1 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 2,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 3,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 4,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
},
)
external_energy_metadata_1 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_1",
"unit_of_measurement": "kWh",
}
external_energy_statistics_2 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 20,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 30,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 40,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
)
external_energy_metadata_2 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_2",
"unit_of_measurement": "kWh",
}
external_co2_statistics = (
{
"start": period1,
"last_reset": None,
"mean": 10,
},
{
"start": period2,
"last_reset": None,
"mean": 30,
},
{
"start": period3,
"last_reset": None,
"mean": 60,
},
{
"start": period4,
"last_reset": None,
"mean": 90,
},
)
external_co2_metadata = {
"has_mean": True,
"has_sum": False,
"name": "Fossil percentage",
"source": "test",
"statistic_id": "test:fossil_percentage",
"unit_of_measurement": "%",
}
# Create some duplicated statistics with schema version 23
with patch.object(recorder, "models", old_models), patch.object(
recorder.migration, "SCHEMA_VERSION", old_models.SCHEMA_VERSION
), patch(
"homeassistant.components.recorder.create_engine", new=_create_engine_test
):
hass = get_test_home_assistant()
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
wait_recording_done(hass)
wait_recording_done(hass)
with session_scope(hass=hass) as session:
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_1)
)
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_2)
)
session.add(recorder.models.StatisticsMeta.from_meta(external_co2_metadata))
with session_scope(hass=hass) as session:
for stat in external_energy_statistics_1:
session.add(recorder.models.Statistics.from_stats(1, stat))
for stat in external_energy_statistics_2:
session.add(recorder.models.Statistics.from_stats(2, stat))
for stat in external_co2_statistics:
session.add(recorder.models.Statistics.from_stats(3, stat))
hass.stop()
# Test that the duplicates are removed during migration from schema 23
hass = get_test_home_assistant()
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
hass.start()
wait_recording_done(hass)
wait_recording_done(hass)
hass.stop()
assert "Deleted 2 duplicated statistics rows" in caplog.text
assert "Found non identical" not in caplog.text
assert "Found more than" not in caplog.text
assert "Found duplicated" not in caplog.text
@pytest.mark.freeze_time("2021-08-01 00:00:00+00:00")
def test_delete_duplicates_non_identical(caplog, tmpdir):
"""Test removal of duplicated statistics."""
test_db_file = tmpdir.mkdir("sqlite").join("test_run_info.db")
dburl = f"{SQLITE_URL_PREFIX}//{test_db_file}"
module = "tests.components.recorder.models_schema_23"
importlib.import_module(module)
old_models = sys.modules[module]
period1 = dt_util.as_utc(dt_util.parse_datetime("2021-09-01 00:00:00"))
period2 = dt_util.as_utc(dt_util.parse_datetime("2021-09-30 23:00:00"))
period3 = dt_util.as_utc(dt_util.parse_datetime("2021-10-01 00:00:00"))
period4 = dt_util.as_utc(dt_util.parse_datetime("2021-10-31 23:00:00"))
external_energy_statistics_1 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 2,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 3,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 4,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 6,
},
)
external_energy_metadata_1 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_1",
"unit_of_measurement": "kWh",
}
external_energy_statistics_2 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 20,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 30,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 40,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
)
external_energy_metadata_2 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_2",
"unit_of_measurement": "kWh",
}
# Create some duplicated statistics with schema version 23
with patch.object(recorder, "models", old_models), patch.object(
recorder.migration, "SCHEMA_VERSION", old_models.SCHEMA_VERSION
), patch(
"homeassistant.components.recorder.create_engine", new=_create_engine_test
):
hass = get_test_home_assistant()
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
wait_recording_done(hass)
wait_recording_done(hass)
with session_scope(hass=hass) as session:
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_1)
)
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_2)
)
with session_scope(hass=hass) as session:
for stat in external_energy_statistics_1:
session.add(recorder.models.Statistics.from_stats(1, stat))
for stat in external_energy_statistics_2:
session.add(recorder.models.Statistics.from_stats(2, stat))
hass.stop()
# Test that the duplicates are removed during migration from schema 23
hass = get_test_home_assistant()
hass.config.config_dir = tmpdir
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
hass.start()
wait_recording_done(hass)
wait_recording_done(hass)
hass.stop()
assert "Deleted 2 duplicated statistics rows" in caplog.text
assert "Deleted 1 non identical" in caplog.text
assert "Found more than" not in caplog.text
assert "Found duplicated" not in caplog.text
isotime = dt_util.utcnow().isoformat()
backup_file_name = f"deleted_statistics.{isotime}.json"
with open(hass.config.path(backup_file_name)) as backup_file:
backup = json.load(backup_file)
assert backup == [
{
"created": "2021-08-01T00:00:00",
"id": 4,
"last_reset": None,
"max": None,
"mean": None,
"metadata_id": 1,
"min": None,
"start": "2021-10-31T23:00:00",
"state": 3.0,
"sum": 5.0,
}
]
@patch.object(statistics, "MAX_DUPLICATES", 2)
def test_delete_duplicates_too_many(caplog, tmpdir):
"""Test removal of duplicated statistics."""
test_db_file = tmpdir.mkdir("sqlite").join("test_run_info.db")
dburl = f"{SQLITE_URL_PREFIX}//{test_db_file}"
module = "tests.components.recorder.models_schema_23"
importlib.import_module(module)
old_models = sys.modules[module]
period1 = dt_util.as_utc(dt_util.parse_datetime("2021-09-01 00:00:00"))
period2 = dt_util.as_utc(dt_util.parse_datetime("2021-09-30 23:00:00"))
period3 = dt_util.as_utc(dt_util.parse_datetime("2021-10-01 00:00:00"))
period4 = dt_util.as_utc(dt_util.parse_datetime("2021-10-31 23:00:00"))
external_energy_statistics_1 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 2,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 3,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 4,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
},
)
external_energy_metadata_1 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_1",
"unit_of_measurement": "kWh",
}
external_energy_statistics_2 = (
{
"start": period1,
"last_reset": None,
"state": 0,
"sum": 20,
},
{
"start": period2,
"last_reset": None,
"state": 1,
"sum": 30,
},
{
"start": period3,
"last_reset": None,
"state": 2,
"sum": 40,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
{
"start": period4,
"last_reset": None,
"state": 3,
"sum": 50,
},
)
external_energy_metadata_2 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_2",
"unit_of_measurement": "kWh",
}
# Create some duplicated statistics with schema version 23
with patch.object(recorder, "models", old_models), patch.object(
recorder.migration, "SCHEMA_VERSION", old_models.SCHEMA_VERSION
), patch(
"homeassistant.components.recorder.create_engine", new=_create_engine_test
):
hass = get_test_home_assistant()
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
wait_recording_done(hass)
wait_recording_done(hass)
with session_scope(hass=hass) as session:
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_1)
)
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_2)
)
with session_scope(hass=hass) as session:
for stat in external_energy_statistics_1:
session.add(recorder.models.Statistics.from_stats(1, stat))
for stat in external_energy_statistics_2:
session.add(recorder.models.Statistics.from_stats(2, stat))
hass.stop()
# Test that the duplicates are removed during migration from schema 23
hass = get_test_home_assistant()
hass.config.config_dir = tmpdir
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
hass.start()
wait_recording_done(hass)
wait_recording_done(hass)
hass.stop()
assert "Deleted 2 duplicated statistics rows" in caplog.text
assert "Found non identical" not in caplog.text
assert "Found more than 1 duplicated statistic rows" in caplog.text
assert "Found duplicated" not in caplog.text
@patch.object(statistics, "MAX_DUPLICATES", 2)
def test_delete_duplicates_short_term(caplog, tmpdir):
"""Test removal of duplicated statistics."""
test_db_file = tmpdir.mkdir("sqlite").join("test_run_info.db")
dburl = f"{SQLITE_URL_PREFIX}//{test_db_file}"
module = "tests.components.recorder.models_schema_23"
importlib.import_module(module)
old_models = sys.modules[module]
period4 = dt_util.as_utc(dt_util.parse_datetime("2021-10-31 23:00:00"))
external_energy_metadata_1 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_1",
"unit_of_measurement": "kWh",
}
statistic_row = {
"start": period4,
"last_reset": None,
"state": 3,
"sum": 5,
}
# Create some duplicated statistics with schema version 23
with patch.object(recorder, "models", old_models), patch.object(
recorder.migration, "SCHEMA_VERSION", old_models.SCHEMA_VERSION
), patch(
"homeassistant.components.recorder.create_engine", new=_create_engine_test
):
hass = get_test_home_assistant()
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
wait_recording_done(hass)
wait_recording_done(hass)
with session_scope(hass=hass) as session:
session.add(
recorder.models.StatisticsMeta.from_meta(external_energy_metadata_1)
)
with session_scope(hass=hass) as session:
session.add(
recorder.models.StatisticsShortTerm.from_stats(1, statistic_row)
)
session.add(
recorder.models.StatisticsShortTerm.from_stats(1, statistic_row)
)
hass.stop()
# Test that the duplicates are removed during migration from schema 23
hass = get_test_home_assistant()
hass.config.config_dir = tmpdir
setup_component(hass, "recorder", {"recorder": {"db_url": dburl}})
hass.start()
wait_recording_done(hass)
wait_recording_done(hass)
hass.stop()
assert "duplicated statistics rows" not in caplog.text
assert "Found non identical" not in caplog.text
assert "Found more than" not in caplog.text
assert "Deleted duplicated short term statistic" in caplog.text
def test_delete_duplicates_no_duplicates(hass_recorder, caplog):
"""Test removal of duplicated statistics."""
hass = hass_recorder()
wait_recording_done(hass)
with session_scope(hass=hass) as session:
delete_duplicates(hass.data[DATA_INSTANCE], session)
assert "duplicated statistics rows" not in caplog.text
assert "Found non identical" not in caplog.text
assert "Found more than" not in caplog.text
assert "Found duplicated" not in caplog.text
def test_duplicate_statistics_handle_integrity_error(hass_recorder, caplog):
"""Test the recorder does not blow up if statistics is duplicated."""
hass = hass_recorder()
wait_recording_done(hass)
period1 = dt_util.as_utc(dt_util.parse_datetime("2021-09-01 00:00:00"))
period2 = dt_util.as_utc(dt_util.parse_datetime("2021-09-30 23:00:00"))
external_energy_metadata_1 = {
"has_mean": False,
"has_sum": True,
"name": "Total imported energy",
"source": "test",
"statistic_id": "test:total_energy_import_tariff_1",
"unit_of_measurement": "kWh",
}
external_energy_statistics_1 = [
{
"start": period1,
"last_reset": None,
"state": 3,
"sum": 5,
},
]
external_energy_statistics_2 = [
{
"start": period2,
"last_reset": None,
"state": 3,
"sum": 6,
}
]
with patch.object(
statistics, "_statistics_exists", return_value=False
), patch.object(
statistics, "_insert_statistics", wraps=statistics._insert_statistics
) as insert_statistics_mock:
async_add_external_statistics(
hass, external_energy_metadata_1, external_energy_statistics_1
)
async_add_external_statistics(
hass, external_energy_metadata_1, external_energy_statistics_1
)
async_add_external_statistics(
hass, external_energy_metadata_1, external_energy_statistics_2
)
wait_recording_done(hass)
assert insert_statistics_mock.call_count == 3
with session_scope(hass=hass) as session:
tmp = session.query(recorder.models.Statistics).all()
assert len(tmp) == 2
assert "Blocked attempt to insert duplicated statistic rows" in caplog.text
def record_states(hass):
"""Record some test states.