Add object source logger to profiler (#90650)

* Add object source logger to profiler

* fixes

* cleanup

* tweaks

* logging

* logging

* too intensive

* adjust

* Update homeassistant/bootstrap.py

* fixes

* fixes

* coverage
J. Nick Koston 2023-04-02 14:54:21 -10:00, committed by GitHub
parent 2229a63acd
commit 51ff027fce
3 changed files with 314 additions and 30 deletions

homeassistant/components/profiler/__init__.py

@@ -17,7 +17,7 @@ import voluptuous as vol
from homeassistant.components import persistent_notification
from homeassistant.config_entries import ConfigEntry
from homeassistant.const import CONF_SCAN_INTERVAL, CONF_TYPE
from homeassistant.core import HomeAssistant, ServiceCall
from homeassistant.core import HomeAssistant, ServiceCall, callback
from homeassistant.exceptions import HomeAssistantError
import homeassistant.helpers.config_validation as cv
from homeassistant.helpers.event import async_track_time_interval
@@ -29,6 +29,8 @@ SERVICE_START = "start"
SERVICE_MEMORY = "memory"
SERVICE_START_LOG_OBJECTS = "start_log_objects"
SERVICE_STOP_LOG_OBJECTS = "stop_log_objects"
SERVICE_START_LOG_OBJECT_SOURCES = "start_log_object_sources"
SERVICE_STOP_LOG_OBJECT_SOURCES = "stop_log_object_sources"
SERVICE_DUMP_LOG_OBJECTS = "dump_log_objects"
SERVICE_LRU_STATS = "lru_stats"
SERVICE_LOG_THREAD_FRAMES = "log_thread_frames"
@@ -60,7 +62,10 @@ SERVICES = (
DEFAULT_SCAN_INTERVAL = timedelta(seconds=30)
DEFAULT_MAX_OBJECTS = 5
CONF_SECONDS = "seconds"
CONF_MAX_OBJECTS = "max_objects"
LOG_INTERVAL_SUB = "log_interval_subscription"
@@ -85,7 +90,7 @@ async def async_setup_entry( # noqa: C901
async def _async_start_log_objects(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB in domain_data:
domain_data[LOG_INTERVAL_SUB]()
raise HomeAssistantError("Object logging already started")
persistent_notification.async_create(
hass,
@@ -103,21 +108,53 @@ async def async_setup_entry( # noqa: C901
async def _async_stop_log_objects(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB not in domain_data:
return
raise HomeAssistantError("Object logging not running")
persistent_notification.async_dismiss(hass, "profile_object_logging")
domain_data.pop(LOG_INTERVAL_SUB)()
def _safe_repr(obj: Any) -> str:
"""Get the repr of an object but keep going if there is an exception.
We wrap repr to ensure if one object cannot be serialized, we can
still get the rest.
"""
try:
return repr(obj)
except Exception: # pylint: disable=broad-except
return f"Failed to serialize {type(obj)}"
async def _async_start_object_sources(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB in domain_data:
raise HomeAssistantError("Object logging already started")
persistent_notification.async_create(
hass,
(
"Object source logging has started. See [the logs](/config/logs) to"
" track the growth of new objects."
),
title="Object source logging started",
notification_id="profile_object_source_logging",
)
last_ids: set[int] = set()
last_stats: dict[str, int] = {}
async def _log_object_sources_with_max(*_: Any) -> None:
await hass.async_add_executor_job(
_log_object_sources, call.data[CONF_MAX_OBJECTS], last_ids, last_stats
)
await _log_object_sources_with_max()
cancel_track = async_track_time_interval(
hass, _log_object_sources_with_max, call.data[CONF_SCAN_INTERVAL]
)
@callback
def _cancel():
cancel_track()
last_ids.clear()
last_stats.clear()
domain_data[LOG_INTERVAL_SUB] = _cancel
@callback
def _async_stop_object_sources(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB not in domain_data:
raise HomeAssistantError("Object logging not running")
persistent_notification.async_dismiss(hass, "profile_object_source_logging")
domain_data.pop(LOG_INTERVAL_SUB)()
def _dump_log_objects(call: ServiceCall) -> None:
# Imports deferred to avoid loading modules
@@ -143,15 +180,6 @@ async def async_setup_entry( # noqa: C901
notification_id="profile_object_dump",
)
def _get_function_absfile(func: Any) -> str:
"""Get the absolute file path of a function."""
import inspect # pylint: disable=import-outside-toplevel
abs_file = "unknown"
with suppress(Exception):
abs_file = inspect.getabsfile(func)
return abs_file
def _lru_stats(call: ServiceCall) -> None:
"""Log the stats of all lru caches."""
# Imports deferred to avoid loading modules
@@ -164,7 +192,7 @@ async def async_setup_entry( # noqa: C901
_LOGGER.critical(
"Cache stats for lru_cache %s at %s: %s",
lru.__wrapped__,
_get_function_absfile(lru.__wrapped__),
_get_function_absfile(lru.__wrapped__) or "unknown",
lru.cache_info(),
)
@@ -175,7 +203,7 @@ async def async_setup_entry( # noqa: C901
_LOGGER.critical(
"Cache stats for LRU %s at %s: %s",
type(class_with_lru_attr),
_get_function_absfile(class_with_lru_attr),
_get_function_absfile(class_with_lru_attr) or "unknown",
maybe_lru.get_stats(),
)
@@ -267,6 +295,30 @@ async def async_setup_entry( # noqa: C901
_async_stop_log_objects,
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_START_LOG_OBJECT_SOURCES,
_async_start_object_sources,
schema=vol.Schema(
{
vol.Optional(
CONF_SCAN_INTERVAL, default=DEFAULT_SCAN_INTERVAL
): cv.time_period,
vol.Optional(CONF_MAX_OBJECTS, default=DEFAULT_MAX_OBJECTS): vol.Range(
min=1, max=1024
),
}
),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_STOP_LOG_OBJECT_SOURCES,
_async_stop_object_sources,
)
async_register_admin_service(
hass,
DOMAIN,
@@ -404,3 +456,101 @@ def _log_objects(*_):
import objgraph # pylint: disable=import-outside-toplevel
_LOGGER.critical("Memory Growth: %s", objgraph.growth(limit=1000))
def _get_function_absfile(func: Any) -> str | None:
"""Get the absolute file path of a function."""
import inspect # pylint: disable=import-outside-toplevel
abs_file: str | None = None
with suppress(Exception):
abs_file = inspect.getabsfile(func)
return abs_file
def _safe_repr(obj: Any) -> str:
"""Get the repr of an object but keep going if there is an exception.
We wrap repr to ensure if one object cannot be serialized, we can
still get the rest.
"""
try:
return repr(obj)
except Exception: # pylint: disable=broad-except
return f"Failed to serialize {type(obj)}"
def _find_backrefs_not_to_self(_object: Any) -> list[str]:
import objgraph # pylint: disable=import-outside-toplevel
return [
_safe_repr(backref)
for backref in objgraph.find_backref_chain(
_object, lambda obj: obj is not _object
)
]
def _log_object_sources(
max_objects: int, last_ids: set[int], last_stats: dict[str, int]
) -> None:
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import gc # pylint: disable=import-outside-toplevel
gc.collect()
objects = gc.get_objects()
new_objects: list[object] = []
new_objects_overflow: dict[str, int] = {}
current_ids = set()
new_stats: dict[str, int] = {}
had_new_object_growth = False
try:
for _object in objects:
object_type = type(_object).__name__
new_stats[object_type] = new_stats.get(object_type, 0) + 1
for _object in objects:
id_ = id(_object)
current_ids.add(id_)
if id_ in last_ids:
continue
object_type = type(_object).__name__
if last_stats.get(object_type, 0) < new_stats[object_type]:
if len(new_objects) < max_objects:
new_objects.append(_object)
else:
new_objects_overflow.setdefault(object_type, 0)
new_objects_overflow[object_type] += 1
for _object in new_objects:
had_new_object_growth = True
object_type = type(_object).__name__
_LOGGER.critical(
"New object %s (%s/%s) at %s: %s",
object_type,
last_stats.get(object_type, 0),
new_stats[object_type],
_get_function_absfile(_object) or _find_backrefs_not_to_self(_object),
_safe_repr(_object),
)
for object_type, count in last_stats.items():
new_stats[object_type] = max(new_stats.get(object_type, 0), count)
finally:
# Break reference cycles
del objects
del new_objects
last_ids.clear()
last_ids.update(current_ids)
last_stats.clear()
last_stats.update(new_stats)
del new_stats
del current_ids
if new_objects_overflow:
_LOGGER.critical("New objects overflowed by %s", new_objects_overflow)
elif not had_new_object_growth:
_LOGGER.critical("No new object growth found")
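For orientation, here is a minimal standalone sketch of the detection idea behind _log_object_sources above: snapshot gc.get_objects(), remember object ids and per-type counts, and on the next pass report only objects whose id was not seen before and whose type count has grown. The function find_new_objects and the surrounding scaffolding are illustrative names for this sketch, not part of the integration.

# Standalone sketch (assumed names, not the integration code) of the
# object-source detection loop: diff two gc snapshots by id and type count.
import gc
from collections import Counter
from typing import Any


def find_new_objects(
    last_ids: set[int], last_stats: dict[str, int], max_objects: int = 5
) -> tuple[list[Any], dict[str, int]]:
    """Return up to max_objects new objects and per-type overflow counts."""
    gc.collect()
    objects = gc.get_objects()
    new_stats = Counter(type(obj).__name__ for obj in objects)
    current_ids = {id(obj) for obj in objects}
    new_objects: list[Any] = []
    overflow: dict[str, int] = {}
    for obj in objects:
        type_name = type(obj).__name__
        # Skip objects we already saw, and types whose population did not grow.
        if id(obj) in last_ids or last_stats.get(type_name, 0) >= new_stats[type_name]:
            continue
        if len(new_objects) < max_objects:
            new_objects.append(obj)
        else:
            overflow[type_name] = overflow.get(type_name, 0) + 1
    # Carry the snapshot forward for the next pass, as the service does.
    last_ids.clear()
    last_ids.update(current_ids)
    last_stats.clear()
    last_stats.update(new_stats)
    return new_objects, overflow


class Leaky:
    """Gc-tracked object used only to simulate growth between passes."""


# Two passes: the first seeds the baseline, the second reports growth.
seen_ids: set[int] = set()
seen_stats: dict[str, int] = {}
find_new_objects(seen_ids, seen_stats)
leak = [Leaky() for _ in range(3)]  # simulate growth between passes
new, over = find_new_objects(seen_ids, seen_stats)
print(f"{len(new)} new objects, overflow: {over}")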

homeassistant/components/profiler/services.yaml

@@ -25,7 +25,7 @@ memory:
max: 3600
unit_of_measurement: seconds
start_log_objects:
name: Start log objects
name: Start logging objects
description: Start logging growth of objects in memory
fields:
scan_interval:
@@ -38,7 +38,7 @@ start_log_objects:
max: 3600
unit_of_measurement: seconds
stop_log_objects:
name: Stop log objects
name: Stop logging objects
description: Stop logging growth of objects in memory.
dump_log_objects:
name: Dump log objects
@@ -51,6 +51,31 @@ dump_log_objects:
example: State
selector:
text:
start_log_object_sources:
name: Start logging object sources
description: Start logging sources of new objects in memory
fields:
scan_interval:
name: Scan interval
description: The number of seconds between logging objects.
default: 30.0
selector:
number:
min: 1
max: 3600
unit_of_measurement: seconds
max_objects:
name: Maximum objects
description: The maximum number of objects to log.
default: 5
selector:
number:
min: 1
max: 30
unit_of_measurement: objects
stop_log_object_sources:
name: Stop logging object sources
description: Stop logging sources of new objects in memory.
lru_stats:
name: Log LRU stats
description: Log the stats of all lru caches.
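As a usage sketch, the two new services are invoked like any other profiler service. The snippet below mirrors the service calls used in the tests and assumes an async context with a HomeAssistant instance that already has the profiler config entry loaded.

# Sketch: start object-source logging (scan every 10 seconds, report at most
# 5 new objects per pass), then stop it again. Assumes `hass` is already set up.
from homeassistant.core import HomeAssistant


async def exercise_object_source_logging(hass: HomeAssistant) -> None:
    await hass.services.async_call(
        "profiler",
        "start_log_object_sources",
        {"scan_interval": 10, "max_objects": 5},
        blocking=True,
    )
    # ...watch the logs for "New object" / "No new object growth found" entries...
    await hass.services.async_call(
        "profiler", "stop_log_object_sources", {}, blocking=True
    )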

tests/components/profiler/test_init.py

@@ -19,7 +19,9 @@ from homeassistant.components.profiler import (
SERVICE_LRU_STATS,
SERVICE_MEMORY,
SERVICE_START,
SERVICE_START_LOG_OBJECT_SOURCES,
SERVICE_START_LOG_OBJECTS,
SERVICE_STOP_LOG_OBJECT_SOURCES,
SERVICE_STOP_LOG_OBJECTS,
)
from homeassistant.components.profiler.const import DOMAIN
@@ -130,13 +132,20 @@ async def test_object_growth_logging(
await hass.services.async_call(
DOMAIN, SERVICE_START_LOG_OBJECTS, {CONF_SCAN_INTERVAL: 10}, blocking=True
)
with pytest.raises(HomeAssistantError, match="Object logging already started"):
await hass.services.async_call(
DOMAIN,
SERVICE_START_LOG_OBJECTS,
{CONF_SCAN_INTERVAL: 10},
blocking=True,
)
assert "Growth" in caplog.text
caplog.clear()
assert "Growth" in caplog.text
caplog.clear()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
await hass.async_block_till_done()
assert "Growth" in caplog.text
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
await hass.async_block_till_done()
assert "Growth" in caplog.text
await hass.services.async_call(DOMAIN, SERVICE_STOP_LOG_OBJECTS, {}, blocking=True)
caplog.clear()
@@ -145,6 +154,17 @@ async def test_object_growth_logging(
await hass.async_block_till_done()
assert "Growth" not in caplog.text
with pytest.raises(HomeAssistantError, match="Object logging not running"):
await hass.services.async_call(
DOMAIN, SERVICE_STOP_LOG_OBJECTS, {}, blocking=True
)
with patch("objgraph.growth"):
await hass.services.async_call(
DOMAIN, SERVICE_START_LOG_OBJECTS, {CONF_SCAN_INTERVAL: 10}, blocking=True
)
caplog.clear()
assert await hass.config_entries.async_unload(entry.entry_id)
await hass.async_block_till_done()
@@ -276,3 +296,92 @@ async def test_lru_stats(hass: HomeAssistant, caplog: pytest.LogCaptureFixture)
assert "_dummy_test_lru_stats" in caplog.text
assert "CacheInfo" in caplog.text
assert "sqlalchemy_test" in caplog.text
async def test_log_object_sources(
hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
"""Test we can setup and the service and we can dump objects to the log."""
entry = MockConfigEntry(domain=DOMAIN)
entry.add_to_hass(hass)
assert await hass.config_entries.async_setup(entry.entry_id)
await hass.async_block_till_done()
assert hass.services.has_service(DOMAIN, SERVICE_START_LOG_OBJECT_SOURCES)
assert hass.services.has_service(DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES)
class FakeObject:
"""Fake object."""
def __repr__(self):
"""Return a fake repr.""."""
return "<FakeObject>"
fake_object = FakeObject()
with patch("gc.collect"), patch("gc.get_objects", return_value=[fake_object]):
await hass.services.async_call(
DOMAIN,
SERVICE_START_LOG_OBJECT_SOURCES,
{CONF_SCAN_INTERVAL: 10},
blocking=True,
)
with pytest.raises(HomeAssistantError, match="Object logging already started"):
await hass.services.async_call(
DOMAIN,
SERVICE_START_LOG_OBJECT_SOURCES,
{CONF_SCAN_INTERVAL: 10},
blocking=True,
)
assert "New object FakeObject (0/1)" in caplog.text
caplog.clear()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
await hass.async_block_till_done()
assert "No new object growth found" in caplog.text
fake_object2 = FakeObject()
with patch("gc.collect"), patch(
"gc.get_objects", return_value=[fake_object, fake_object2]
):
caplog.clear()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=21))
await hass.async_block_till_done()
assert "New object FakeObject (1/2)" in caplog.text
many_objects = [FakeObject() for _ in range(30)]
with patch("gc.collect"), patch("gc.get_objects", return_value=many_objects):
caplog.clear()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=31))
await hass.async_block_till_done()
assert "New object FakeObject (2/30)" in caplog.text
assert "New objects overflowed by {'FakeObject': 25}" in caplog.text
await hass.services.async_call(
DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES, {}, blocking=True
)
caplog.clear()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=41))
await hass.async_block_till_done()
assert "FakeObject" not in caplog.text
assert "No new object growth found" not in caplog.text
assert await hass.config_entries.async_unload(entry.entry_id)
await hass.async_block_till_done()
async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=51))
await hass.async_block_till_done()
assert "FakeObject" not in caplog.text
assert "No new object growth found" not in caplog.text
with pytest.raises(HomeAssistantError, match="Object logging not running"):
await hass.services.async_call(
DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES, {}, blocking=True
)
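As a closing note on the overflow assertion above: with max_objects left at its default of 5, the 30 fresh FakeObject instances split into 5 individually logged objects and 25 overflow entries, which is the {'FakeObject': 25} the test expects. A trivial sketch of that bookkeeping, with an illustrative helper name:

# Sketch of the logged/overflow split checked by the final test scenario.
def split_logged_and_overflow(new_count: int, max_objects: int = 5) -> tuple[int, int]:
    """Return (objects logged individually, objects counted as overflow)."""
    logged = min(new_count, max_objects)
    return logged, new_count - logged


assert split_logged_and_overflow(30) == (5, 25)  # matches "{'FakeObject': 25}"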