From 9a47116c586ecb922697d1b5ff449d9525d181d7 Mon Sep 17 00:00:00 2001
From: Todor Ivanov
Date: Tue, 25 Jul 2023 19:13:02 +0200
Subject: [PATCH] Switch to psutil for resource utilization monitoring at runtime.

Replace the `ps` and `awk /proc/<pid>/smaps` subprocess calls in
periodicUpdate with psutil.Process queries (memory_full_info and
cpu_times). psutil reports memory in bytes, so PSS is now divided by
1000**2 to obtain megabytes.

---
 .../WMRuntime/Monitors/PerformanceMonitor.py  | 47 +++++++------------
 1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py
index 4508786a7a..34204e2823 100644
--- a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py
+++ b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py
@@ -14,6 +14,7 @@
 import os.path
 import signal
 import time
+import psutil
 
 import WMCore.Algorithms.SubprocessAlgos as subprocessAlgos
 import WMCore.FwkJobReport.Report as Report
@@ -84,8 +85,6 @@ def __init__(self):
 
         self.pid = None
         self.uid = os.getuid()
-        self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
-        self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"
         self.monitorCommand = None
         self.currentStepSpace = None
         self.currentStepName = None
@@ -208,38 +207,24 @@ def periodicUpdate(self):
             # Then we have no step PID, we can do nothing
             return
 
-        # Now we run the ps monitor command and collate the data
-        # Gathers RSS, %CPU and %MEM statistics from ps
-        ps_cmd = self.monitorBase % (stepPID, stepPID)
-        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)
-
-        ps_output = stdout.split()
-        if not len(ps_output) > 6:
-            # Then something went wrong in getting the ps data
-            msg = "Error when grabbing output from process ps\n"
-            msg += "output = %s\n" % ps_output
-            msg += "command = %s\n" % ps_cmd
-            logging.error(msg)
-            return
-
-        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
-        smaps_cmd = self.pssMemoryCommand % (stepPID)
-        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)
-
-        smaps_output = stdout.split()
-        if not len(smaps_output) == 1:
-            # Then something went wrong in getting the smaps data
-            msg = "Error when grabbing output from smaps\n"
-            msg += "output = %s\n" % smaps_output
-            msg += "command = %s\n" % smaps_cmd
-            logging.error(msg)
+        # Now we run the psutil module and collate some process data.
+        # We gather info about CPU system time and Memory statistics - VMS, PSS, RSS
+        stepProc = psutil.Process(stepPID)
+        if not stepProc.is_running():
+            # The step process has vanished before we managed to finish the current periodic update
+            msg = " The step process with PID: %s of STEP: %s "
+            msg += "has finished execution before the last periodic resource update.\n"
+            logging.warning(msg, stepPID, self.currentStepName)
             return
 
-        # smaps also returns data in kiloBytes, let's make it megaBytes
-        # I'm also confused with these megabytes and mebibytes...
-        pss = int(smaps_output[0]) // 1000
+        stepCmd = stepProc.cmdline()
+        stepMemInfo = stepProc.memory_full_info()
+        stepCpuInfo = stepProc.cpu_times()
+        # NOTE: All the information from psutil.memory_*info() comes in Bytes
+        # we need to make it in MegaBytes
+        pss = int(stepMemInfo.pss) // (1000**2)
 
-        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])
+        logging.info("PID: %s; VirtMEM: %s, PSS: %s; RSS: %s; SystemTime: %s;", stepPID, stepMemInfo.vms, stepMemInfo.pss, stepMemInfo.rss, stepCpuInfo.system)
 
         msg = 'Error in CMSSW step %s\n' % self.currentStepName
         msg += 'Number of Cores: %s\n' % self.numOfCores