From 51ff027fcefbfd0d08dfdcbf7d3b392cd943a949 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 2 Apr 2023 14:54:21 -1000 Subject: [PATCH] Add object source logger to profiler (#90650) * Add object source logger to profiler * fixes * cleanup * tweaks * logging * logging * too intensive * adjust * Update homeassistant/bootstrap.py * fixes * fixes * coverage --- homeassistant/components/profiler/__init__.py | 196 ++++++++++++++++-- .../components/profiler/services.yaml | 29 ++- tests/components/profiler/test_init.py | 119 ++++++++++- 3 files changed, 314 insertions(+), 30 deletions(-) diff --git a/homeassistant/components/profiler/__init__.py b/homeassistant/components/profiler/__init__.py index 27e302f47c4..95ce69aed4a 100644 --- a/homeassistant/components/profiler/__init__.py +++ b/homeassistant/components/profiler/__init__.py @@ -17,7 +17,7 @@ import voluptuous as vol from homeassistant.components import persistent_notification from homeassistant.config_entries import ConfigEntry from homeassistant.const import CONF_SCAN_INTERVAL, CONF_TYPE -from homeassistant.core import HomeAssistant, ServiceCall +from homeassistant.core import HomeAssistant, ServiceCall, callback from homeassistant.exceptions import HomeAssistantError import homeassistant.helpers.config_validation as cv from homeassistant.helpers.event import async_track_time_interval @@ -29,6 +29,8 @@ SERVICE_START = "start" SERVICE_MEMORY = "memory" SERVICE_START_LOG_OBJECTS = "start_log_objects" SERVICE_STOP_LOG_OBJECTS = "stop_log_objects" +SERVICE_START_LOG_OBJECT_SOURCES = "start_log_object_sources" +SERVICE_STOP_LOG_OBJECT_SOURCES = "stop_log_object_sources" SERVICE_DUMP_LOG_OBJECTS = "dump_log_objects" SERVICE_LRU_STATS = "lru_stats" SERVICE_LOG_THREAD_FRAMES = "log_thread_frames" @@ -60,7 +62,10 @@ SERVICES = ( DEFAULT_SCAN_INTERVAL = timedelta(seconds=30) +DEFAULT_MAX_OBJECTS = 5 + CONF_SECONDS = "seconds" +CONF_MAX_OBJECTS = "max_objects" LOG_INTERVAL_SUB = "log_interval_subscription" @@ -85,7 +90,7 @@ async def async_setup_entry( # noqa: C901 async def _async_start_log_objects(call: ServiceCall) -> None: if LOG_INTERVAL_SUB in domain_data: - domain_data[LOG_INTERVAL_SUB]() + raise HomeAssistantError("Object logging already started") persistent_notification.async_create( hass, @@ -103,21 +108,53 @@ async def async_setup_entry( # noqa: C901 async def _async_stop_log_objects(call: ServiceCall) -> None: if LOG_INTERVAL_SUB not in domain_data: - return + raise HomeAssistantError("Object logging not running") persistent_notification.async_dismiss(hass, "profile_object_logging") domain_data.pop(LOG_INTERVAL_SUB)() - def _safe_repr(obj: Any) -> str: - """Get the repr of an object but keep going if there is an exception. + async def _async_start_object_sources(call: ServiceCall) -> None: + if LOG_INTERVAL_SUB in domain_data: + raise HomeAssistantError("Object logging already started") - We wrap repr to ensure if one object cannot be serialized, we can - still get the rest. - """ - try: - return repr(obj) - except Exception: # pylint: disable=broad-except - return f"Failed to serialize {type(obj)}" + persistent_notification.async_create( + hass, + ( + "Object source logging has started. See [the logs](/config/logs) to" + " track the growth of new objects." 
+ ), + title="Object source logging started", + notification_id="profile_object_source_logging", + ) + + last_ids: set[int] = set() + last_stats: dict[str, int] = {} + + async def _log_object_sources_with_max(*_: Any) -> None: + await hass.async_add_executor_job( + _log_object_sources, call.data[CONF_MAX_OBJECTS], last_ids, last_stats + ) + + await _log_object_sources_with_max() + cancel_track = async_track_time_interval( + hass, _log_object_sources_with_max, call.data[CONF_SCAN_INTERVAL] + ) + + @callback + def _cancel(): + cancel_track() + last_ids.clear() + last_stats.clear() + + domain_data[LOG_INTERVAL_SUB] = _cancel + + @callback + def _async_stop_object_sources(call: ServiceCall) -> None: + if LOG_INTERVAL_SUB not in domain_data: + raise HomeAssistantError("Object logging not running") + + persistent_notification.async_dismiss(hass, "profile_object_source_logging") + domain_data.pop(LOG_INTERVAL_SUB)() def _dump_log_objects(call: ServiceCall) -> None: # Imports deferred to avoid loading modules @@ -143,15 +180,6 @@ async def async_setup_entry( # noqa: C901 notification_id="profile_object_dump", ) - def _get_function_absfile(func: Any) -> str: - """Get the absolute file path of a function.""" - import inspect # pylint: disable=import-outside-toplevel - - abs_file = "unknown" - with suppress(Exception): - abs_file = inspect.getabsfile(func) - return abs_file - def _lru_stats(call: ServiceCall) -> None: """Log the stats of all lru caches.""" # Imports deferred to avoid loading modules @@ -164,7 +192,7 @@ async def async_setup_entry( # noqa: C901 _LOGGER.critical( "Cache stats for lru_cache %s at %s: %s", lru.__wrapped__, - _get_function_absfile(lru.__wrapped__), + _get_function_absfile(lru.__wrapped__) or "unknown", lru.cache_info(), ) @@ -175,7 +203,7 @@ async def async_setup_entry( # noqa: C901 _LOGGER.critical( "Cache stats for LRU %s at %s: %s", type(class_with_lru_attr), - _get_function_absfile(class_with_lru_attr), + _get_function_absfile(class_with_lru_attr) or "unknown", maybe_lru.get_stats(), ) @@ -267,6 +295,30 @@ async def async_setup_entry( # noqa: C901 _async_stop_log_objects, ) + async_register_admin_service( + hass, + DOMAIN, + SERVICE_START_LOG_OBJECT_SOURCES, + _async_start_object_sources, + schema=vol.Schema( + { + vol.Optional( + CONF_SCAN_INTERVAL, default=DEFAULT_SCAN_INTERVAL + ): cv.time_period, + vol.Optional(CONF_MAX_OBJECTS, default=DEFAULT_MAX_OBJECTS): vol.Range( + min=1, max=1024 + ), + } + ), + ) + + async_register_admin_service( + hass, + DOMAIN, + SERVICE_STOP_LOG_OBJECT_SOURCES, + _async_stop_object_sources, + ) + async_register_admin_service( hass, DOMAIN, @@ -404,3 +456,101 @@ def _log_objects(*_): import objgraph # pylint: disable=import-outside-toplevel _LOGGER.critical("Memory Growth: %s", objgraph.growth(limit=1000)) + + +def _get_function_absfile(func: Any) -> str | None: + """Get the absolute file path of a function.""" + import inspect # pylint: disable=import-outside-toplevel + + abs_file: str | None = None + with suppress(Exception): + abs_file = inspect.getabsfile(func) + return abs_file + + +def _safe_repr(obj: Any) -> str: + """Get the repr of an object but keep going if there is an exception. + + We wrap repr to ensure if one object cannot be serialized, we can + still get the rest. 
+ """ + try: + return repr(obj) + except Exception: # pylint: disable=broad-except + return f"Failed to serialize {type(obj)}" + + +def _find_backrefs_not_to_self(_object: Any) -> list[str]: + import objgraph # pylint: disable=import-outside-toplevel + + return [ + _safe_repr(backref) + for backref in objgraph.find_backref_chain( + _object, lambda obj: obj is not _object + ) + ] + + +def _log_object_sources( + max_objects: int, last_ids: set[int], last_stats: dict[str, int] +) -> None: + # Imports deferred to avoid loading modules + # in memory since usually only one part of this + # integration is used at a time + import gc # pylint: disable=import-outside-toplevel + + gc.collect() + + objects = gc.get_objects() + new_objects: list[object] = [] + new_objects_overflow: dict[str, int] = {} + current_ids = set() + new_stats: dict[str, int] = {} + had_new_object_growth = False + try: + for _object in objects: + object_type = type(_object).__name__ + new_stats[object_type] = new_stats.get(object_type, 0) + 1 + + for _object in objects: + id_ = id(_object) + current_ids.add(id_) + if id_ in last_ids: + continue + object_type = type(_object).__name__ + if last_stats.get(object_type, 0) < new_stats[object_type]: + if len(new_objects) < max_objects: + new_objects.append(_object) + else: + new_objects_overflow.setdefault(object_type, 0) + new_objects_overflow[object_type] += 1 + + for _object in new_objects: + had_new_object_growth = True + object_type = type(_object).__name__ + _LOGGER.critical( + "New object %s (%s/%s) at %s: %s", + object_type, + last_stats.get(object_type, 0), + new_stats[object_type], + _get_function_absfile(_object) or _find_backrefs_not_to_self(_object), + _safe_repr(_object), + ) + + for object_type, count in last_stats.items(): + new_stats[object_type] = max(new_stats.get(object_type, 0), count) + finally: + # Break reference cycles + del objects + del new_objects + last_ids.clear() + last_ids.update(current_ids) + last_stats.clear() + last_stats.update(new_stats) + del new_stats + del current_ids + + if new_objects_overflow: + _LOGGER.critical("New objects overflowed by %s", new_objects_overflow) + elif not had_new_object_growth: + _LOGGER.critical("No new object growth found") diff --git a/homeassistant/components/profiler/services.yaml b/homeassistant/components/profiler/services.yaml index 1105842891f..3bd6d7636ac 100644 --- a/homeassistant/components/profiler/services.yaml +++ b/homeassistant/components/profiler/services.yaml @@ -25,7 +25,7 @@ memory: max: 3600 unit_of_measurement: seconds start_log_objects: - name: Start log objects + name: Start logging objects description: Start logging growth of objects in memory fields: scan_interval: @@ -38,7 +38,7 @@ start_log_objects: max: 3600 unit_of_measurement: seconds stop_log_objects: - name: Stop log objects + name: Stop logging objects description: Stop logging growth of objects in memory. dump_log_objects: name: Dump log objects @@ -51,6 +51,31 @@ dump_log_objects: example: State selector: text: +start_log_object_sources: + name: Start logging object sources + description: Start logging sources of new objects in memory + fields: + scan_interval: + name: Scan interval + description: The number of seconds between logging objects. + default: 30.0 + selector: + number: + min: 1 + max: 3600 + unit_of_measurement: seconds + max_objects: + name: Maximum objects + description: The maximum number of objects to log. 
+      default: 5
+      selector:
+        number:
+          min: 1
+          max: 30
+          unit_of_measurement: objects
+stop_log_object_sources:
+  name: Stop logging object sources
+  description: Stop logging sources of new objects in memory.
 lru_stats:
   name: Log LRU stats
   description: Log the stats of all lru caches.
diff --git a/tests/components/profiler/test_init.py b/tests/components/profiler/test_init.py
index 9466660dca4..0cafa9ed7aa 100644
--- a/tests/components/profiler/test_init.py
+++ b/tests/components/profiler/test_init.py
@@ -19,7 +19,9 @@ from homeassistant.components.profiler import (
     SERVICE_LRU_STATS,
     SERVICE_MEMORY,
     SERVICE_START,
+    SERVICE_START_LOG_OBJECT_SOURCES,
     SERVICE_START_LOG_OBJECTS,
+    SERVICE_STOP_LOG_OBJECT_SOURCES,
     SERVICE_STOP_LOG_OBJECTS,
 )
 from homeassistant.components.profiler.const import DOMAIN
@@ -130,13 +132,20 @@ async def test_object_growth_logging(
         await hass.services.async_call(
             DOMAIN, SERVICE_START_LOG_OBJECTS, {CONF_SCAN_INTERVAL: 10}, blocking=True
         )
+        with pytest.raises(HomeAssistantError, match="Object logging already started"):
+            await hass.services.async_call(
+                DOMAIN,
+                SERVICE_START_LOG_OBJECTS,
+                {CONF_SCAN_INTERVAL: 10},
+                blocking=True,
+            )
 
-    assert "Growth" in caplog.text
-    caplog.clear()
+        assert "Growth" in caplog.text
+        caplog.clear()
 
-    async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
-    await hass.async_block_till_done()
-    assert "Growth" in caplog.text
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
+        await hass.async_block_till_done()
+        assert "Growth" in caplog.text
 
     await hass.services.async_call(DOMAIN, SERVICE_STOP_LOG_OBJECTS, {}, blocking=True)
     caplog.clear()
@@ -145,6 +154,17 @@ async def test_object_growth_logging(
     await hass.async_block_till_done()
     assert "Growth" not in caplog.text
 
+    with pytest.raises(HomeAssistantError, match="Object logging not running"):
+        await hass.services.async_call(
+            DOMAIN, SERVICE_STOP_LOG_OBJECTS, {}, blocking=True
+        )
+
+    with patch("objgraph.growth"):
+        await hass.services.async_call(
+            DOMAIN, SERVICE_START_LOG_OBJECTS, {CONF_SCAN_INTERVAL: 10}, blocking=True
+        )
+        caplog.clear()
+
     assert await hass.config_entries.async_unload(entry.entry_id)
     await hass.async_block_till_done()
 
@@ -276,3 +296,92 @@ async def test_lru_stats(hass: HomeAssistant, caplog: pytest.LogCaptureFixture)
     assert "_dummy_test_lru_stats" in caplog.text
     assert "CacheInfo" in caplog.text
     assert "sqlalchemy_test" in caplog.text
+
+
+async def test_log_object_sources(
+    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
+) -> None:
+    """Test we can set up the service and log the sources of new objects."""
+
+    entry = MockConfigEntry(domain=DOMAIN)
+    entry.add_to_hass(hass)
+
+    assert await hass.config_entries.async_setup(entry.entry_id)
+    await hass.async_block_till_done()
+
+    assert hass.services.has_service(DOMAIN, SERVICE_START_LOG_OBJECT_SOURCES)
+    assert hass.services.has_service(DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES)
+
+    class FakeObject:
+        """Fake object."""
+
+        def __repr__(self):
+            """Return a fake repr."""
+            return "<fake object>"
+
+    fake_object = FakeObject()
+
+    with patch("gc.collect"), patch("gc.get_objects", return_value=[fake_object]):
+        await hass.services.async_call(
+            DOMAIN,
+            SERVICE_START_LOG_OBJECT_SOURCES,
+            {CONF_SCAN_INTERVAL: 10},
+            blocking=True,
+        )
+        with pytest.raises(HomeAssistantError, match="Object logging already started"):
+            await hass.services.async_call(
+                DOMAIN,
+                SERVICE_START_LOG_OBJECT_SOURCES,
+                {CONF_SCAN_INTERVAL: 10},
+                blocking=True,
+            )
+
+        assert "New object FakeObject (0/1)" in caplog.text
+        caplog.clear()
+
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=11))
+        await hass.async_block_till_done()
+        assert "No new object growth found" in caplog.text
+
+    fake_object2 = FakeObject()
+
+    with patch("gc.collect"), patch(
+        "gc.get_objects", return_value=[fake_object, fake_object2]
+    ):
+        caplog.clear()
+
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=21))
+        await hass.async_block_till_done()
+        assert "New object FakeObject (1/2)" in caplog.text
+
+    many_objects = [FakeObject() for _ in range(30)]
+    with patch("gc.collect"), patch("gc.get_objects", return_value=many_objects):
+        caplog.clear()
+
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=31))
+        await hass.async_block_till_done()
+        assert "New object FakeObject (2/30)" in caplog.text
+        assert "New objects overflowed by {'FakeObject': 25}" in caplog.text
+
+    await hass.services.async_call(
+        DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES, {}, blocking=True
+    )
+    caplog.clear()
+
+    async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=41))
+    await hass.async_block_till_done()
+    assert "FakeObject" not in caplog.text
+    assert "No new object growth found" not in caplog.text
+
+    assert await hass.config_entries.async_unload(entry.entry_id)
+    await hass.async_block_till_done()
+
+    async_fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=51))
+    await hass.async_block_till_done()
+    assert "FakeObject" not in caplog.text
+    assert "No new object growth found" not in caplog.text
+
+    with pytest.raises(HomeAssistantError, match="Object logging not running"):
+        await hass.services.async_call(
+            DOMAIN, SERVICE_STOP_LOG_OBJECT_SOURCES, {}, blocking=True
+        )
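-- 
Example usage (an illustrative sketch, not part of the patch): the new
services can be invoked from Developer Tools or an automation using the fields
defined in services.yaml above. The interval and object cap shown here are
arbitrary sample values, not recommendations; results are written to the
Home Assistant log.

    # Start logging where new objects are coming from, scanning every
    # 10 seconds and reporting at most 5 previously unseen objects per scan.
    service: profiler.start_log_object_sources
    data:
      scan_interval: 10
      max_objects: 5

    # Stop logging object sources and dismiss the persistent notification.
    service: profiler.stop_log_object_sources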