Skip to content

Commit

Permalink
Support CGroup v2 on Supervised with manual restarts (#5419)
Browse files Browse the repository at this point in the history
  • Loading branch information
mdegat01 authored Dec 9, 2024
1 parent 1f89311 commit 829193f
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 11 deletions.
15 changes: 15 additions & 0 deletions supervisor/addons/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ def __init__(self, coresys: CoreSys, slug: str):
self._boot_failed_issue = Issue(
IssueType.BOOT_FAIL, ContextType.ADDON, reference=self.slug
)
self._device_access_missing_issue = Issue(
IssueType.DEVICE_ACCESS_MISSING, ContextType.ADDON, reference=self.slug
)

def __repr__(self) -> str:
"""Return internal representation."""
Expand All @@ -158,6 +161,11 @@ def boot_failed_issue(self) -> Issue:
"""Get issue used if start on boot failed."""
return self._boot_failed_issue

@property
def device_access_missing_issue(self) -> Issue:
"""Get issue used if device access is missing and can't be automatically added."""
return self._device_access_missing_issue

@property
def state(self) -> AddonState:
"""Return state of the add-on."""
Expand All @@ -182,6 +190,13 @@ def state(self, new_state: AddonState) -> None:
):
self.sys_resolution.dismiss_issue(self.boot_failed_issue)

# Dismiss device access missing issue if present and we stopped
if (
new_state == AddonState.STOPPED
and self.device_access_missing_issue in self.sys_resolution.issues
):
self.sys_resolution.dismiss_issue(self.device_access_missing_issue)

self.sys_homeassistant.websocket.send_message(
{
ATTR_TYPE: WSType.SUPERVISOR_EVENT,
Expand Down
20 changes: 19 additions & 1 deletion supervisor/docker/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path
from typing import TYPE_CHECKING

from attr import evolve
from awesomeversion import AwesomeVersion
import docker
from docker.types import Mount
Expand Down Expand Up @@ -40,7 +41,7 @@
from ..hardware.data import Device
from ..jobs.const import JobCondition, JobExecutionLimit
from ..jobs.decorator import Job
from ..resolution.const import ContextType, IssueType, SuggestionType
from ..resolution.const import CGROUP_V2_VERSION, ContextType, IssueType, SuggestionType
from ..utils.sentry import capture_exception
from .const import (
ENV_TIME,
Expand Down Expand Up @@ -802,6 +803,13 @@ async def stop(self, remove_container: bool = True) -> None:

await super().stop(remove_container)

# If there is a device access issue and the container is removed, clear it
if (
remove_container
and self.addon.device_access_missing_issue in self.sys_resolution.issues
):
self.sys_resolution.dismiss_issue(self.addon.device_access_missing_issue)

async def _validate_trust(
self, image_id: str, image: str, version: AwesomeVersion
) -> None:
Expand Down Expand Up @@ -839,6 +847,16 @@ async def _hardware_events(self, device: Device) -> None:
f"Can't process Hardware Event on {self.name}: {err!s}", _LOGGER.error
) from err

if (
self.sys_docker.info.cgroup == CGROUP_V2_VERSION
and not self.sys_os.available
):
self.sys_resolution.add_issue(
evolve(self.addon.device_access_missing_issue),
suggestions=[SuggestionType.EXECUTE_RESTART],
)
return

permission = self.sys_hardware.policy.get_cgroups_rule(device)
try:
await self.sys_dbus.agent.cgroup.add_devices_allowed(
Expand Down
5 changes: 5 additions & 0 deletions supervisor/resolution/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
DNS_CHECK_HOST = "_checkdns.home-assistant.io"
DNS_ERROR_NO_DATA = 1

CGROUP_V1_VERSION = "1"
CGROUP_V2_VERSION = "2"


class ContextType(StrEnum):
"""Place where somethings was happening."""
Expand Down Expand Up @@ -77,6 +80,7 @@ class IssueType(StrEnum):
CORRUPT_FILESYSTEM = "corrupt_filesystem"
DETACHED_ADDON_MISSING = "detached_addon_missing"
DETACHED_ADDON_REMOVED = "detached_addon_removed"
DEVICE_ACCESS_MISSING = "device_access_missing"
DISABLED_DATA_DISK = "disabled_data_disk"
DNS_LOOP = "dns_loop"
DNS_SERVER_FAILED = "dns_server_failed"
Expand Down Expand Up @@ -112,6 +116,7 @@ class SuggestionType(StrEnum):
EXECUTE_REMOVE = "execute_remove"
EXECUTE_REPAIR = "execute_repair"
EXECUTE_RESET = "execute_reset"
EXECUTE_RESTART = "execute_restart"
EXECUTE_START = "execute_start"
EXECUTE_STOP = "execute_stop"
EXECUTE_UPDATE = "execute_update"
Expand Down
9 changes: 2 additions & 7 deletions supervisor/resolution/evaluations/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@

from ...const import CoreState
from ...coresys import CoreSys
from ..const import UnsupportedReason
from ..const import CGROUP_V1_VERSION, CGROUP_V2_VERSION, UnsupportedReason
from .base import EvaluateBase

CGROUP_V1_VERSION = "1"
CGROUP_V2_VERSION = "2"


def setup(coresys: CoreSys) -> EvaluateBase:
"""Initialize evaluation-setup function."""
Expand All @@ -20,9 +17,7 @@ class EvaluateCGroupVersion(EvaluateBase):
@property
def expected_versions(self) -> set[str]:
"""Return expected cgroup versions."""
if self.coresys.os.available:
return {CGROUP_V1_VERSION, CGROUP_V2_VERSION}
return {CGROUP_V1_VERSION}
return {CGROUP_V1_VERSION, CGROUP_V2_VERSION}

@property
def reason(self) -> UnsupportedReason:
Expand Down
60 changes: 60 additions & 0 deletions supervisor/resolution/fixups/addon_execute_restart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Helpers to fix addon by restarting it."""

import logging

from ...coresys import CoreSys
from ...exceptions import AddonsError, ResolutionFixupError
from ..const import ContextType, IssueType, SuggestionType
from .base import FixupBase

_LOGGER: logging.Logger = logging.getLogger(__name__)


def setup(coresys: CoreSys) -> FixupBase:
"""Check setup function."""
return FixupAddonExecuteRestart(coresys)


class FixupAddonExecuteRestart(FixupBase):
"""Storage class for fixup."""

async def process_fixup(self, reference: str | None = None) -> None:
"""Initialize the fixup class."""
if not (addon := self.sys_addons.get(reference, local_only=True)):
_LOGGER.info("Cannot restart addon %s as it does not exist", reference)
return

# Stop addon
try:
await addon.stop()
except AddonsError as err:
_LOGGER.error("Could not stop %s due to %s", reference, err)
raise ResolutionFixupError() from None

# Start addon
# Removing the container has already fixed the issue and dismissed it
# So any errors on startup are just logged. We won't wait on the startup task either
try:
await addon.start()
except AddonsError as err:
_LOGGER.error("Could not restart %s due to %s", reference, err)

@property
def suggestion(self) -> SuggestionType:
"""Return a SuggestionType enum."""
return SuggestionType.EXECUTE_RESTART

@property
def context(self) -> ContextType:
"""Return a ContextType enum."""
return ContextType.ADDON

@property
def issues(self) -> list[IssueType]:
"""Return a IssueType enum list."""
return [IssueType.DEVICE_ACCESS_MISSING]

@property
def auto(self) -> bool:
"""Return if a fixup can be apply as auto fix."""
return False
121 changes: 119 additions & 2 deletions tests/docker/test_addon.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Test docker addon setup."""

import asyncio
from ipaddress import IPv4Address
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, Mock, PropertyMock, patch

Expand All @@ -12,12 +14,17 @@
from supervisor.addons.addon import Addon
from supervisor.addons.model import Data
from supervisor.addons.options import AddonOptions
from supervisor.const import BusEvent
from supervisor.coresys import CoreSys
from supervisor.dbus.agent.cgroup import CGroup
from supervisor.docker.addon import DockerAddon
from supervisor.docker.manager import DockerAPI
from supervisor.exceptions import CoreDNSError, DockerNotFound
from supervisor.hardware.data import Device
from supervisor.os.manager import OSManager
from supervisor.plugins.dns import PluginDns
from supervisor.resolution.const import ContextType, IssueType
from supervisor.resolution.data import Issue
from supervisor.resolution.const import ContextType, IssueType, SuggestionType
from supervisor.resolution.data import Issue, Suggestion

from ..common import load_json_fixture
from . import DEV_MOUNT
Expand Down Expand Up @@ -380,3 +387,113 @@ async def test_addon_stop_delete_host_error(
await docker_addon.stop()

capture_exception.assert_called_once_with(err)


TEST_DEV_PATH = "/dev/ttyAMA0"
TEST_SYSFS_PATH = "/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0"
TEST_HW_DEVICE = Device(
name="ttyACM0",
path=Path("/dev/ttyAMA0"),
sysfs=Path(
"/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0"
),
subsystem="tty",
parent=Path(
"/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0"
),
links=[
Path(
"/dev/serial/by-id/usb-Texas_Instruments_TI_CC2531_USB_CDC___0X0123456789ABCDEF-if00"
),
Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usb-0:1.1:1.0"),
Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usbv2-0:1.1:1.0"),
],
attributes={},
children=[],
)


@pytest.mark.usefixtures("path_extern")
@pytest.mark.parametrize(
("dev_path", "cgroup", "is_os"),
[
(TEST_DEV_PATH, "1", True),
(TEST_SYSFS_PATH, "1", True),
(TEST_DEV_PATH, "1", False),
(TEST_SYSFS_PATH, "1", False),
(TEST_DEV_PATH, "2", True),
(TEST_SYSFS_PATH, "2", True),
],
)
async def test_addon_new_device(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
docker: DockerAPI,
dev_path: str,
cgroup: str,
is_os: bool,
):
"""Test new device that is listed in static devices."""
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.data["devices"] = [dev_path]
container.id = 123
docker.info.cgroup = cgroup

with (
patch.object(Addon, "write_options"),
patch.object(OSManager, "available", new=PropertyMock(return_value=is_os)),
patch.object(CGroup, "add_devices_allowed") as add_devices,
):
await install_addon_ssh.start()

coresys.bus.fire_event(
BusEvent.HARDWARE_NEW_DEVICE,
TEST_HW_DEVICE,
)
await asyncio.sleep(0.01)

add_devices.assert_called_once_with(123, "c 0:0 rwm")


@pytest.mark.usefixtures("path_extern")
@pytest.mark.parametrize("dev_path", [TEST_DEV_PATH, TEST_SYSFS_PATH])
async def test_addon_new_device_no_haos(
coresys: CoreSys,
install_addon_ssh: Addon,
docker: DockerAPI,
dev_path: str,
):
"""Test new device that is listed in static devices on non HAOS system with CGroup V2."""
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.data["devices"] = [dev_path]
docker.info.cgroup = "2"

with (
patch.object(Addon, "write_options"),
patch.object(OSManager, "available", new=PropertyMock(return_value=False)),
patch.object(CGroup, "add_devices_allowed") as add_devices,
):
await install_addon_ssh.start()

coresys.bus.fire_event(
BusEvent.HARDWARE_NEW_DEVICE,
TEST_HW_DEVICE,
)
await asyncio.sleep(0.01)

add_devices.assert_not_called()

# Issue added with hardware event since access cannot be added dynamically
assert install_addon_ssh.device_access_missing_issue in coresys.resolution.issues
assert (
Suggestion(
SuggestionType.EXECUTE_RESTART, ContextType.ADDON, reference="local_ssh"
)
in coresys.resolution.suggestions
)

# Stopping and removing the container clears it as access granted on next start
await install_addon_ssh.stop()
assert coresys.resolution.issues == []
assert coresys.resolution.suggestions == []
2 changes: 1 addition & 1 deletion tests/resolution/evaluation/test_evaluate_cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ async def test_evaluation(coresys: CoreSys):

coresys.docker.info.cgroup = CGROUP_V2_VERSION
await cgroup_version()
assert cgroup_version.reason in coresys.resolution.unsupported
assert cgroup_version.reason not in coresys.resolution.unsupported
coresys.resolution.unsupported.clear()

coresys.docker.info.cgroup = CGROUP_V1_VERSION
Expand Down
Loading

0 comments on commit 829193f

Please sign in to comment.