From 8fe737cbbcf0a5d0e1343c2c0d09ef59189ac1b8 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 1/7] Unclearly exit if terminating takes too long This could happen if someone catches `BaseException`, or due to another issue with cleanup. Also add some specific test cases to catch the specific cases, which can be added to over time. --- sbot/timeout.py | 15 ++++++++++-- .../timeout_scripts/catch-base-exception.py | 10 ++++++++ .../timeout_scripts/catch-exception.py | 10 ++++++++ tests/test_data/timeout_scripts/hot-loop.py | 7 ++++++ tests/test_data/timeout_scripts/sleep.py | 6 +++++ .../test_data/timeout_scripts/try-finally.py | 11 +++++++++ tests/test_timeout.py | 24 ++++++++++++++----- 7 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 tests/test_data/timeout_scripts/catch-base-exception.py create mode 100644 tests/test_data/timeout_scripts/catch-exception.py create mode 100644 tests/test_data/timeout_scripts/hot-loop.py create mode 100644 tests/test_data/timeout_scripts/sleep.py create mode 100644 tests/test_data/timeout_scripts/try-finally.py diff --git a/sbot/timeout.py b/sbot/timeout.py index f30d22a2..7f3701c5 100644 --- a/sbot/timeout.py +++ b/sbot/timeout.py @@ -9,6 +9,8 @@ logger = logging.getLogger(__name__) +TIMEOUT_MESSAGE = "Timeout expired: Game Over!" + def timeout_handler(signal_type: int, stack_frame: Optional[FrameType]) -> None: """ @@ -17,12 +19,21 @@ def timeout_handler(signal_type: int, stack_frame: Optional[FrameType]) -> None: This function is called when the timeout expires and will stop the robot's main process. In order for this to work, any threads that are created must be daemons. + If the process doesn't terminate clearly (perhaps because the exception was caught), + exit less cleanly. + NOTE: This function is not called on Windows. :param signal_type: The sginal that triggered this handler :param stack_frame: The stack frame at the time of the signal """ - logger.info("Timeout expired: Game Over!") + logger.info(TIMEOUT_MESSAGE) + + # If the process doesn't terminate in a given time, exit less cleanly + signal.signal(signal.SIGALRM, signal.SIG_DFL) + signal.alarm(2) + + # Exit cleanly exit(0) @@ -35,7 +46,7 @@ def win_timeout_handler() -> None: NOTE: This function is only called on Windows. """ - logger.info("Timeout expired: Game Over!") + logger.info(TIMEOUT_MESSAGE) os.kill(os.getpid(), signal.SIGTERM) diff --git a/tests/test_data/timeout_scripts/catch-base-exception.py b/tests/test_data/timeout_scripts/catch-base-exception.py new file mode 100644 index 00000000..c0c45ffc --- /dev/null +++ b/tests/test_data/timeout_scripts/catch-base-exception.py @@ -0,0 +1,10 @@ +from sbot.timeout import kill_after_delay +import time + +kill_after_delay(2) + +while True: + try: + time.sleep(10) + except BaseException: + pass diff --git a/tests/test_data/timeout_scripts/catch-exception.py b/tests/test_data/timeout_scripts/catch-exception.py new file mode 100644 index 00000000..18473194 --- /dev/null +++ b/tests/test_data/timeout_scripts/catch-exception.py @@ -0,0 +1,10 @@ +from sbot.timeout import kill_after_delay +import time + +kill_after_delay(2) + +while True: + try: + time.sleep(10) + except Exception: + pass diff --git a/tests/test_data/timeout_scripts/hot-loop.py b/tests/test_data/timeout_scripts/hot-loop.py new file mode 100644 index 00000000..df70783a --- /dev/null +++ b/tests/test_data/timeout_scripts/hot-loop.py @@ -0,0 +1,7 @@ +from sbot.timeout import kill_after_delay + +kill_after_delay(2) + +# This is a bad idea +while True: + pass diff --git a/tests/test_data/timeout_scripts/sleep.py b/tests/test_data/timeout_scripts/sleep.py new file mode 100644 index 00000000..3e9dbd70 --- /dev/null +++ b/tests/test_data/timeout_scripts/sleep.py @@ -0,0 +1,6 @@ +from sbot.timeout import kill_after_delay +import time + +kill_after_delay(2) + +time.sleep(10) diff --git a/tests/test_data/timeout_scripts/try-finally.py b/tests/test_data/timeout_scripts/try-finally.py new file mode 100644 index 00000000..e173a6e8 --- /dev/null +++ b/tests/test_data/timeout_scripts/try-finally.py @@ -0,0 +1,11 @@ +from sbot.timeout import kill_after_delay +import time + +kill_after_delay(2) + +while True: + try: + time.sleep(10) + finally: + # Imagine we wanted to do something else here + pass diff --git a/tests/test_timeout.py b/tests/test_timeout.py index 8dbb5872..c723ab20 100644 --- a/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -2,6 +2,7 @@ from time import sleep, time import os +from pathlib import Path import signal import subprocess import sys @@ -10,6 +11,8 @@ from sbot.timeout import kill_after_delay +TEST_FILES = list((Path(__file__).parent / 'test_data/timeout_scripts').iterdir()) + @pytest.mark.skipif(sys.platform == "win32", reason="does not run on Windows") def test_kill_after_delay() -> None: """Test that the process is killed within the time.""" @@ -17,6 +20,9 @@ def test_kill_after_delay() -> None: kill_after_delay(2) sleep(3) + # Clear the set alarm + signal.alarm(0) + @pytest.mark.skipif(sys.platform != "win32", reason="only runs on Windows") def test_kill_after_delay_windows(monkeypatch) -> None: @@ -34,20 +40,26 @@ def test_kill_after_delay_windows(monkeypatch) -> None: kill.assert_called_once_with(pid, signal.SIGTERM) -def test_kill_after_delay_e2e() -> None: +@pytest.mark.parametrize( + "test_file", + [str(f) for f in TEST_FILES], + ids=[f.name for f in TEST_FILES] +) +def test_kill_after_delay_e2e(test_file: Path) -> None: start_time = time() child = subprocess.Popen([ sys.executable, - "-c", - 'from sbot.timeout import kill_after_delay; import time; kill_after_delay(2); time.sleep(10)', + str(test_file), ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - child.wait(timeout=5) + child.wait(timeout=6) + run_time = time() - start_time - assert time() - start_time == pytest.approx(2, rel=1) + assert 2 < run_time < 6 if sys.platform == "win32": # Windows terminates uncleanly assert child.returncode == signal.SIGTERM else: - assert child.returncode == 0 + # Either the process was killed cleanly, or the fallback did + assert child.returncode in [0, -signal.SIGALRM] From 4d424467768f63e20bc703faf37422e8df2c1794 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 2/7] Ensure timeout doesn't hold process running --- tests/test_data/timeout_scripts/early-exit.py | 3 +++ tests/test_timeout.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 tests/test_data/timeout_scripts/early-exit.py diff --git a/tests/test_data/timeout_scripts/early-exit.py b/tests/test_data/timeout_scripts/early-exit.py new file mode 100644 index 00000000..830b1c02 --- /dev/null +++ b/tests/test_data/timeout_scripts/early-exit.py @@ -0,0 +1,3 @@ +from sbot.timeout import kill_after_delay + +kill_after_delay(2) diff --git a/tests/test_timeout.py b/tests/test_timeout.py index c723ab20..4d02d827 100644 --- a/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -42,7 +42,7 @@ def test_kill_after_delay_windows(monkeypatch) -> None: @pytest.mark.parametrize( "test_file", - [str(f) for f in TEST_FILES], + TEST_FILES, ids=[f.name for f in TEST_FILES] ) def test_kill_after_delay_e2e(test_file: Path) -> None: @@ -55,7 +55,10 @@ def test_kill_after_delay_e2e(test_file: Path) -> None: child.wait(timeout=6) run_time = time() - start_time - assert 2 < run_time < 6 + assert run_time < 6 + + if test_file.name != "early-exit.py": + assert run_time > 2 if sys.platform == "win32": # Windows terminates uncleanly From b9af79040504d30da6f207149f63769b36707186 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 3/7] Add condition around sigalarm fallback Needed to make mypy happy --- sbot/timeout.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sbot/timeout.py b/sbot/timeout.py index 7f3701c5..ca53285f 100644 --- a/sbot/timeout.py +++ b/sbot/timeout.py @@ -29,9 +29,11 @@ def timeout_handler(signal_type: int, stack_frame: Optional[FrameType]) -> None: """ logger.info(TIMEOUT_MESSAGE) - # If the process doesn't terminate in a given time, exit less cleanly - signal.signal(signal.SIGALRM, signal.SIG_DFL) - signal.alarm(2) + # This condition is needed to keep MyPy happy + if sys.platform != "win32": + # If the process doesn't terminate in a given time, exit less cleanly + signal.signal(signal.SIGALRM, signal.SIG_DFL) + signal.alarm(2) # Exit cleanly exit(0) From 709ca485e6530cdd5f0e6432c53d26bafe0de57a Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 4/7] Separate custom tests out of main test case --- .../early-exit.py | 0 .../timeout_scripts_extra/exception.py | 5 +++ tests/test_timeout.py | 38 ++++++++++++++++--- 3 files changed, 38 insertions(+), 5 deletions(-) rename tests/test_data/{timeout_scripts => timeout_scripts_extra}/early-exit.py (100%) create mode 100644 tests/test_data/timeout_scripts_extra/exception.py diff --git a/tests/test_data/timeout_scripts/early-exit.py b/tests/test_data/timeout_scripts_extra/early-exit.py similarity index 100% rename from tests/test_data/timeout_scripts/early-exit.py rename to tests/test_data/timeout_scripts_extra/early-exit.py diff --git a/tests/test_data/timeout_scripts_extra/exception.py b/tests/test_data/timeout_scripts_extra/exception.py new file mode 100644 index 00000000..4f2529e1 --- /dev/null +++ b/tests/test_data/timeout_scripts_extra/exception.py @@ -0,0 +1,5 @@ +from sbot.timeout import kill_after_delay + +kill_after_delay(2) + +1 / 0 diff --git a/tests/test_timeout.py b/tests/test_timeout.py index 4d02d827..d186fcbe 100644 --- a/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -12,6 +12,7 @@ from sbot.timeout import kill_after_delay TEST_FILES = list((Path(__file__).parent / 'test_data/timeout_scripts').iterdir()) +EXTRA_TEST_FILES_DIR = Path(__file__).parent / 'test_data/timeout_scripts_extra' @pytest.mark.skipif(sys.platform == "win32", reason="does not run on Windows") def test_kill_after_delay() -> None: @@ -43,7 +44,7 @@ def test_kill_after_delay_windows(monkeypatch) -> None: @pytest.mark.parametrize( "test_file", TEST_FILES, - ids=[f.name for f in TEST_FILES] + ids=[f.stem for f in TEST_FILES] ) def test_kill_after_delay_e2e(test_file: Path) -> None: start_time = time() @@ -55,10 +56,7 @@ def test_kill_after_delay_e2e(test_file: Path) -> None: child.wait(timeout=6) run_time = time() - start_time - assert run_time < 6 - - if test_file.name != "early-exit.py": - assert run_time > 2 + assert 2 < run_time < 6 if sys.platform == "win32": # Windows terminates uncleanly @@ -66,3 +64,33 @@ def test_kill_after_delay_e2e(test_file: Path) -> None: else: # Either the process was killed cleanly, or the fallback did assert child.returncode in [0, -signal.SIGALRM] + +def test_early_exit() -> None: + start_time = time() + child = subprocess.Popen([ + sys.executable, + str(EXTRA_TEST_FILES_DIR / "early-exit.py"), + ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + child.wait(timeout=6) + + run_time = time() - start_time + + assert run_time < 2 + + assert child.returncode == 0 + +def test_exception() -> None: + start_time = time() + child = subprocess.Popen([ + sys.executable, + str(EXTRA_TEST_FILES_DIR / "exception.py"), + ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + child.wait(timeout=6) + + run_time = time() - start_time + + assert run_time < 2 + + assert child.returncode == 1 From d6c3af8560705c09d5e947aa5ca81b7264a9f153 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 5/7] Only allow SIGALRM if the execution took too long --- tests/test_timeout.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_timeout.py b/tests/test_timeout.py index d186fcbe..dc153775 100644 --- a/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -61,9 +61,12 @@ def test_kill_after_delay_e2e(test_file: Path) -> None: if sys.platform == "win32": # Windows terminates uncleanly assert child.returncode == signal.SIGTERM + elif run_time < 4: + # If the process terminated quickly, it should be successful + assert child.returncode == 0 else: - # Either the process was killed cleanly, or the fallback did - assert child.returncode in [0, -signal.SIGALRM] + # If the process took too long, it was killed ungracefully + assert child.returncode == -signal.SIGALRM def test_early_exit() -> None: start_time = time() From fc2f41c6cd7fe30e38b6432acdc1860f215b4366 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 6/7] Run windows timeout Timer as daemon This ensures it doesn't block the running process --- sbot/timeout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sbot/timeout.py b/sbot/timeout.py index ca53285f..aa0a9333 100644 --- a/sbot/timeout.py +++ b/sbot/timeout.py @@ -65,6 +65,7 @@ def kill_after_delay(timeout_seconds: int) -> None: # Windows doesn't have SIGALRM, # so we approximate its functionality using a delayed SIGTERM timer = Timer(timeout_seconds, win_timeout_handler) + timer.daemon = True timer.start() else: signal.signal(signal.SIGALRM, timeout_handler) From 2731a572eeaa3a2243fdde4cbdd5617e97e898f4 Mon Sep 17 00:00:00 2001 From: WillB97 Date: Sun, 28 Jul 2024 23:03:03 +0100 Subject: [PATCH 7/7] Use a second loop of SIGALRM to run cleanup functions before crashing out --- sbot/timeout.py | 26 +++++++++++++++++++------- tests/test_timeout.py | 4 ++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/sbot/timeout.py b/sbot/timeout.py index aa0a9333..da11a369 100644 --- a/sbot/timeout.py +++ b/sbot/timeout.py @@ -1,4 +1,5 @@ """Functions for killing the robot after a certain amount of time.""" +import atexit import logging import os import signal @@ -10,6 +11,7 @@ logger = logging.getLogger(__name__) TIMEOUT_MESSAGE = "Timeout expired: Game Over!" +EXIT_ATTEMPTS = 0 def timeout_handler(signal_type: int, stack_frame: Optional[FrameType]) -> None: @@ -27,16 +29,26 @@ def timeout_handler(signal_type: int, stack_frame: Optional[FrameType]) -> None: :param signal_type: The sginal that triggered this handler :param stack_frame: The stack frame at the time of the signal """ - logger.info(TIMEOUT_MESSAGE) + global EXIT_ATTEMPTS + EXIT_ATTEMPTS += 1 - # This condition is needed to keep MyPy happy - if sys.platform != "win32": - # If the process doesn't terminate in a given time, exit less cleanly - signal.signal(signal.SIGALRM, signal.SIG_DFL) + if sys.platform == "win32": + raise AssertionError("This function should not be called on Windows") + + if EXIT_ATTEMPTS == 1: + # Allow 2 seconds for the process to exit cleanly before killing it signal.alarm(2) + logger.info(TIMEOUT_MESSAGE) + # Exit cleanly + exit(0) + else: + # The process didn't exit cleanly, so first call the cleanup handlers + # and use an unhanded alarm to force python to exit with a core dump + signal.signal(signal.SIGALRM, signal.SIG_DFL) + signal.alarm(2) # Allow 2 seconds for cleanup - # Exit cleanly - exit(0) + atexit._run_exitfuncs() + exit(0) def win_timeout_handler() -> None: diff --git a/tests/test_timeout.py b/tests/test_timeout.py index dc153775..5a95b989 100644 --- a/tests/test_timeout.py +++ b/tests/test_timeout.py @@ -53,10 +53,10 @@ def test_kill_after_delay_e2e(test_file: Path) -> None: str(test_file), ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - child.wait(timeout=6) + child.wait(timeout=8) run_time = time() - start_time - assert 2 < run_time < 6 + assert 2 < run_time < 8 if sys.platform == "win32": # Windows terminates uncleanly