dmwm · vkuznet · Sep 30, 2024 · Oct 14, 2024 · amaltaro · Oct 19, 2024
diff --git a/src/python/WMComponent/WorkflowUpdater/SiteListUpdater.py b/src/python/WMComponent/WorkflowUpdater/SiteListUpdater.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+"""
+File       : SiteListUpdater
+Author     : Valentin Kuznetsov <vkuznet AT gmail dot com>
+Description: module to update the site lists of active workflows within a WMAgent
+"""
+
+# system modules
+import os
+import json
+import shutil
+import logging
+import threading
+
+# WMCore modules
+from Utils.CertTools import ckey, cert
+from Utils.Timers import timeFunction
+from WMCore.Agent.Harness import Harness
+from WMCore.DAOFactory import DAOFactory
+from WMCore.Services.pycurl_manager import getdata
+from WMCore.Services.WorkQueue.WorkQueue import WorkQueue
+from WMCore.WMException import WMException
+from WMCore.WMSpec.WMWorkload import WMWorkloadHelper
+from WMCore.WorkerThreads.BaseWorkerThread import BaseWorkerThread
+
+
+class SiteListUpdaterPoller(BaseWorkerThread):
+    """
+    Worker thread which compares the site white/black lists stored in the
+    pickled specs of the agent's active workflows against the values reported
+    by the WMStats server and, when they differ, updates the local WorkQueue,
+    the spec stored in the local CouchDB and the pickle file itself.
+    """
+
+    def __init__(self, config):
+        """
+        Initialize SiteListUpdaterPoller object
+        :param config: a Configuration object with the component configuration
+        """
+        BaseWorkerThread.__init__(self)
+        myThread = threading.currentThread()
+        self.logger = myThread.logger
+
+        # base URL of the WMStats data service used to fetch the upstream site lists
+        self.wmstatsUrl = getattr(config.SiteListUpdater, "wmstatsUrl")
+
+        # provide access to WMBS in local WMAgent
+        self.daoFactory = DAOFactory(package="WMCore.WMBS",
+                                     logger=myThread.logger,
+                                     dbinterface=myThread.dbi)
+        # DB function to retrieve active workflows
+        self.listActiveWflows = self.daoFactory(classname="Workflow.GetUnfinishedWorkflows")
+
+        # local WorkQueue service; read the settings from the `config` argument,
+        # nothing in this constructor assigns a `self.config` attribute
+        self.localCouchUrl = config.WorkQueueManager.couchurl
+        self.localWQ = WorkQueue(self.localCouchUrl,
+                                 config.WorkQueueManager.dbname)
+
+    def getActiveWorkflows(self):
+        """
+        Provide list of active requests within WMAgent
+        :return: dict of workflows names vs pickle files
+        """
+        # each record carries the workflow name and the "spec" column of WMBS
+        # (wmbs_workflow.spec), which is the pickle file name
+        wflowSpecs = self.listActiveWflows.execute()
+        return {wflowSpec['name']: wflowSpec['spec'] for wflowSpec in wflowSpecs}
+
+    def getRequestSpecs(self, requests):
+        """
+        Return list of requests specs for provided list of request names
+        :param requests: iterable of workflow requests names
+        :return wsList: list of workflow records obtained from wmstats server, each record has the following structure
+        {"RequestName": "bla", "SiteWhitelist":[], "SiteBlacklist": []}
+        """
+        # one query per request status, all of them fetched through a single getdata call
+        states = ['running-open', 'acquired']
+        urlTemplate = "{}/data/filtered_requests?RequestStatus={}&mask=SiteWhitelist&mask=SiteBlacklist"
+        urls = [urlTemplate.format(self.wmstatsUrl, state) for state in states]
+
+        # build the lookup set once, membership is tested for every upstream record
+        wanted = set(requests)
+        wsList = []
+        for resp in getdata(urls, ckey(), cert()):
+            data = json.loads(resp['data'])
+            # keep only the records of workflows known to this agent
+            wsList.extend(rdict for rdict in data['result'] if rdict['RequestName'] in wanted)
+        return wsList
+
+    @timeFunction
+    def algorithm(self, parameters=None):
+        """
+        Perform the following logic:
+        - obtain list of current active workflows from the agent
+        - requests their site lists from the upstream WMStats server
+        - update site lists of all workflows whose lists differ
+        - push new site lists to the agent local WorkQueue and update pickle spec file
+
+        :param parameters: not used by this method
+        :return: none
+        """
+        # get list of active workflows from the agent, the returned dict
+        # is composed by workflow names and associated pickle file (data comes from WMBS)
+        wmaSpecs = self.getActiveWorkflows()
+
+        # obtain workflow records from wmstats server
+        wsList = self.getRequestSpecs(wmaSpecs.keys())
+
+        # iterate over workflow items and update local WorkQueue and pickle files if
+        # either site white or black lists are different
+        for rdict in wsList:
+            wflow = rdict['RequestName']
+            siteWhiteList = rdict['SiteWhitelist']
+            siteBlackList = rdict['SiteBlacklist']
+
+            # get the name of pkl file from wma spec
+            pklFileName = wmaSpecs[wflow]
+
+            # create wrapper helper and load pickle file
+            wHelper = WMWorkloadHelper()
+            wHelper.load(pklFileName)
+
+            # extract from pickle spec both white and black site lists and compare them
+            # (order-insensitively) to the ones received from the upstream service
+            wmaWhiteList = wHelper.getSiteWhitelist()
+            wmaBlackList = wHelper.getSiteBlacklist()
+            whiteListChanged = set(wmaWhiteList) != set(siteWhiteList)
+            blackListChanged = set(wmaBlackList) != set(siteBlackList)
+            if not (whiteListChanged or blackListChanged):
+                continue
+
+            self.logger.info(f"Updating {wflow}: siteWhiteList {wmaWhiteList} => {siteWhiteList} and siteBlackList {wmaBlackList} => {siteBlackList}")
+            try:
+                # update local WorkQueue first
+                self.localWQ.updateSiteLists(wflow, siteWhiteList, siteBlackList)
+            except Exception as ex:
+                # best effort: a failure for one workflow must not stop the others
+                msg = f"Caught unexpected exception in SiteListUpdater. Details:\n{str(ex)}"
+                self.logger.exception(msg)
+                continue
+
+            # update workload only if we updated local WorkQueue,
+            # and only for the list(s) which actually changed
+            if whiteListChanged:
+                wHelper.setSiteWhitelist(siteWhiteList)
+            if blackListChanged:
+                wHelper.setSiteBlacklist(siteBlackList)
+
+            try:
+                # persist the spec in local CouchDB
+                self.logger.info(f"Updating {self.localCouchUrl} with new site lists for {wflow}")
+                wHelper.saveCouchUrl(self.localCouchUrl)
+
+                # write the new spec next to the original one first, so the
+                # original pickle file is only replaced by a non-empty file
+                newPklFileName = pklFileName.split('.pkl')[0] + '_new.pkl'
+                wHelper.save(newPklFileName)
+
+                # if new pickle file is saved we can swap it with original one
+                if os.path.getsize(newPklFileName) > 0:
+                    self.logger.info(f"Updated {pklFileName}")
+                    shutil.move(newPklFileName, pklFileName)
+                else:
+                    self.logger.warning(f"New pickle file {newPklFileName} is empty, keeping {pklFileName} untouched")
+            except Exception as ex:
+                msg = f"Caught unexpected exception in SiteListUpdater. Details:\n{str(ex)}"
+                self.logger.exception(msg)
+                continue
diff --git a/src/python/WMComponent/WorkflowUpdater/WorkflowUpdater.py b/src/python/WMComponent/WorkflowUpdater/WorkflowUpdater.py
@@ -35,3 +35,7 @@ def preInitialization(self):
         myThread = threading.currentThread()
         myThread.workerThreadManager.addWorker(WorkflowUpdaterPoller(self.config),
                                                pollInterval)
+
+        myThread = threading.currentThread()
+        myThread.workerThreadManager.addWorker(SiteListUpdaterPoller(self.config),
+                                               pollInterval)
diff --git a/src/python/WMCore/Services/WorkQueue/WorkQueue.py b/src/python/WMCore/Services/WorkQueue/WorkQueue.py
@@ -237,6 +237,33 @@ def cancelWorkflow(self, wf):
         elements = [x['id'] for x in data.get('rows', []) if x['key'][1] not in nonCancelableElements]
         return self.updateElements(*elements, Status='CancelRequested')
 
+    def updateSiteLists(self, wf, siteWhiteList=None, siteBlackList=None):
+        """
+        Update site lists of a workflow
+
+        :param wf: workflow name
+        :param siteWhiteList: new site white list (optional, left untouched when None)
+        :param siteBlackList: new site black list (optional, left untouched when None)
+        :return: None
+        """
+        # forward only the lists which were actually provided, so that an omitted
+        # (None) argument does not overwrite the value stored in the elements.
+        # The keys follow the SiteWhitelist/SiteBlacklist spelling used for these
+        # request arguments everywhere else.
+        # NOTE(review): confirm the spelling against the WorkQueue element schema
+        updateParams = {}
+        if siteWhiteList is not None:
+            updateParams['SiteWhitelist'] = siteWhiteList
+        if siteBlackList is not None:
+            updateParams['SiteBlacklist'] = siteBlackList
+
+        # Fetch all elements of this workflow; no status filtering is applied here
+        data = self.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
+                                {'startkey': [wf], 'endkey': [wf, {}],
+                                 'reduce': False})
+        elementsToUpdate = [x['id'] for x in data.get('rows', [])]
+        if elementsToUpdate and updateParams:
+            self.updateElements(*elementsToUpdate, **updateParams)
     def updateElementsByWorkflow(self, wf, updateParams, status=None): 
         """ 
         Update all available WorkQueue elements of a given workflow  with a set 
         of arguments provided through the `updateParams` dictionary 
         :param wf:           The workflow name 
         :param updateParams: A dictionary with parameters  to be updated 
         :param status:       A list of allowed WorkQueue elements statuses to be considered for updating 
                              Default: None - do not filter by status 
         :return:             No value, raises exceptions from internal methods in case of errors. 
         """ 
         # Fetch the whole view with Workqueue elements per given workflow 
         # (presumably the [wf] .. [wf, {}] key range matches every status of this workflow - TODO confirm)
         data = self.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', 
                                 {'startkey': [wf], 'endkey': [wf, {}], 
                                  'reduce': False}) 
         # Fetch only a list of WorkQueue element Ids && Filter them by allowed status 
         # (an empty or None `status` disables the filtering) 
         if status: 
             elementsToUpdate = [x['id'] for x in data.get('rows', []) if x['value']['Status'] in status] 
         else: 
             elementsToUpdate = [x['id'] for x in data.get('rows', [])] 
         # Update all WorkQueue elements with the parameters provided in a single push 
         if elementsToUpdate: 
             self.updateElements(*elementsToUpdate, **updateParams) 
         # Update the spec, if it exists 
         # (this happens regardless of whether any element matched above) 
         if self.db.documentExists(wf): 
             wmspec = WMWorkloadHelper() 
             wmspec.load(self.hostWithAuth + "/%s/%s/spec" % (self.db.name, wf)) 
             # the same dictionary is applied to the spec through its setter map 
             wmspec.updateWorkloadArgs(updateParams) 
             dummy_values = {'name': wmspec.name()} 
             wmspec.saveCouch(self.hostWithAuth, self.db.name, dummy_values) 
         return 
     def updateWorkloadArgs(self, reqArgs): 
         """ 
         Method to take a dictionary of arguments of the type: 
         {reqArg1: value, 
          reqArg2: value, 
          ...} 
         and update the workload by a predefined map of reqArg to setter methods. 
         :param reqArgs: A Dictionary of request arguments to be updated 
         :return:        Nothing, Raises an error of type WMWorkloadException if 
                         fails to apply the proper setter method 
         """ 
         # NOTE: So far we support only a single argument setter methods, like 
         #       setSiteWhitelist or setPriority. This may change in the future, 
         #       but it will require a change in the logic of how we validate and 
         #       call the proper setter methods below. 
         # populate the current instance settersMap 
         # (each entry keeps the argument name, the bound setter and its signature) 
         self.settersMap['RequestPriority'] = setterTuple('RequestPriority', self.setPriority, inspect.signature(self.setPriority)) 
         self.settersMap['SiteBlacklist'] = setterTuple('SiteBlacklist', self.setSiteBlacklist, inspect.signature(self.setSiteBlacklist)) 
         self.settersMap['SiteWhitelist'] = setterTuple('SiteWhitelist', self.setSiteWhitelist, inspect.signature(self.setSiteWhitelist)) 
         # First validate if we can properly call the setter function given the reqArgs passed. 
         # All arguments are validated before any setter runs, so an unsupported 
         # argument is rejected before the workload is modified. 
         for reqArg, argValue in reqArgs.items(): 
             if not self.settersMap.get(reqArg, None): 
                 msg = f"Unsupported or missing setter method for updating reqArg: {reqArg}." 
                 raise WMWorkloadException(msg) 
             try: 
                 # Signature.bind raises TypeError when a single positional argument 
                 # cannot be mapped onto the setter's parameters 
                 self.settersMap[reqArg].setterSignature.bind(argValue) 
             except TypeError as ex: 
                 # NOTE(review): the literal "req" right before the exception text looks unintended - confirm 
                 msg = f"Setter's method signature does not match the method calls we currently support: Error: req{str(ex)}" 
                 raise WMWorkloadException(msg) from None 
         # Now go through the reqArg again and call every setter method according to the map 
         for reqArg, argValue in reqArgs.items(): 
             try: 
                 self.settersMap[reqArg].setterFunc(argValue) 
             except Exception as ex: 
                 # collect the arguments of this very method call for the error message 
                 currFrame = inspect.currentframe() 
                 argsInfo = inspect.getargvalues(currFrame) 
                 argVals = {arg: argsInfo.locals.get(arg) for arg in argsInfo.args} 
                 # NOTE(review): the last two fragments are concatenated without a separator 
                 msg = f"Failure while calling setter method {self.settersMap[reqArg].setterFunc.__name__} " 
                 msg += f"With arguments: {argVals}" 
                 msg += f"Full exception string: {str(ex)}" 
                 # `from None` drops the original exception from the chained traceback 
                 raise WMWorkloadException(msg) from None 
     def updateElementsByWorkflow(self, wf, updateParams, status=None): 
         """ 
         Update all available WorkQueue elements of a given workflow  with a set 
         of arguments provided through the `updateParams` dictionary 
         :param wf:           The workflow name 
         :param updateParams: A dictionary with parameters  to be updated 
         :param status:       A list of allowed WorkQueue elements statuses to be considered for updating 
                              Default: None - do not filter by status 
         :return:             No value, raises exceptions from internal methods in case of errors. 
         """ 
         # Fetch the whole view with Workqueue elements per given workflow 
         # (presumably the [wf] .. [wf, {}] key range matches every status of this workflow - TODO confirm)
         data = self.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', 
                                 {'startkey': [wf], 'endkey': [wf, {}], 
                                  'reduce': False}) 
  
         # Fetch only a list of WorkQueue element Ids && Filter them by allowed status 
         # (an empty or None `status` disables the filtering) 
         if status: 
             elementsToUpdate = [x['id'] for x in data.get('rows', []) if x['value']['Status'] in status] 
         else: 
             elementsToUpdate = [x['id'] for x in data.get('rows', [])] 
  
         # Update all WorkQueue elements with the parameters provided in a single push 
         if elementsToUpdate: 
             self.updateElements(*elementsToUpdate, **updateParams) 
  
         # Update the spec, if it exists 
         # (this happens regardless of whether any element matched above) 
         if self.db.documentExists(wf): 
             wmspec = WMWorkloadHelper() 
             wmspec.load(self.hostWithAuth + "/%s/%s/spec" % (self.db.name, wf)) 
             # the same dictionary is applied to the spec through its setter map 
             wmspec.updateWorkloadArgs(updateParams) 
             dummy_values = {'name': wmspec.name()} 
             wmspec.saveCouch(self.hostWithAuth, self.db.name, dummy_values) 
         return 
     def updateWorkloadArgs(self, reqArgs): 
         """ 
         Method to take a dictionary of arguments of the type: 
         {reqArg1: value, 
          reqArg2: value, 
          ...} 
         and update the workload by a predefined map of reqArg to setter methods. 
         :param reqArgs: A Dictionary of request arguments to be updated 
         :return:        Nothing, Raises an error of type WMWorkloadException if 
                         fails to apply the proper setter method 
         """ 
         # NOTE: So far we support only a single argument setter methods, like 
         #       setSiteWhitelist or setPriority. This may change in the future, 
         #       but it will require a change in the logic of how we validate and 
         #       call the proper setter methods below. 
  
         # populate the current instance settersMap 
         # (each entry keeps the argument name, the bound setter and its signature) 
         self.settersMap['RequestPriority'] = setterTuple('RequestPriority', self.setPriority, inspect.signature(self.setPriority)) 
         self.settersMap['SiteBlacklist'] = setterTuple('SiteBlacklist', self.setSiteBlacklist, inspect.signature(self.setSiteBlacklist)) 
         self.settersMap['SiteWhitelist'] = setterTuple('SiteWhitelist', self.setSiteWhitelist, inspect.signature(self.setSiteWhitelist)) 
  
         # First validate if we can properly call the setter function given the reqArgs passed. 
         # All arguments are validated before any setter runs, so an unsupported 
         # argument is rejected before the workload is modified. 
         for reqArg, argValue in reqArgs.items(): 
             if not self.settersMap.get(reqArg, None): 
                 msg = f"Unsupported or missing setter method for updating reqArg: {reqArg}." 
                 raise WMWorkloadException(msg) 
             try: 
                 # Signature.bind raises TypeError when a single positional argument 
                 # cannot be mapped onto the setter's parameters 
                 self.settersMap[reqArg].setterSignature.bind(argValue) 
             except TypeError as ex: 
                 # NOTE(review): the literal "req" right before the exception text looks unintended - confirm 
                 msg = f"Setter's method signature does not match the method calls we currently support: Error: req{str(ex)}" 
                 raise WMWorkloadException(msg) from None 
  
         # Now go through the reqArg again and call every setter method according to the map 
         for reqArg, argValue in reqArgs.items(): 
             try: 
                 self.settersMap[reqArg].setterFunc(argValue) 
             except Exception as ex: 
                 # collect the arguments of this very method call for the error message 
                 currFrame = inspect.currentframe() 
                 argsInfo = inspect.getargvalues(currFrame) 
                 argVals = {arg: argsInfo.locals.get(arg) for arg in argsInfo.args} 
                 # NOTE(review): the last two fragments are concatenated without a separator 
                 msg = f"Failure while calling setter method {self.settersMap[reqArg].setterFunc.__name__} " 
                 msg += f"With arguments: {argVals}" 
                 msg += f"Full exception string: {str(ex)}" 
                 # `from None` drops the original exception from the chained traceback 
                 raise WMWorkloadException(msg) from None 
+        # Update the spec, if it exists
+        if self.db.documentExists(wf):
+            wmspec = WMWorkloadHelper()
+            # load the spec document stored in the local workqueue CouchDB
+            wmspec.load(self.hostWithAuth + "/%s/%s/spec" % (self.db.name, wf))
+            # both lists default to None: only apply the ones which were provided,
+            # using the setSiteWhitelist/setSiteBlacklist setters of the workload helper
+            if siteWhiteList is not None:
+                wmspec.setSiteWhitelist(siteWhiteList)
+            if siteBlackList is not None:
+                wmspec.setSiteBlacklist(siteBlackList)
+            # save the modified spec back to the same database
+            dummy_values = {'name': wmspec.name()}
+            wmspec.saveCouch(self.hostWithAuth, self.db.name, dummy_values)
+        return
+
     def updatePriority(self, wf, priority):
         """Update priority of a workflow, this implies
            updating the spec and the priority of the Available elements"""

diff --git a/src/python/WMCore/WMSpec/WMWorkload.py b/src/python/WMCore/WMSpec/WMWorkload.py
@@ -673,6 +673,15 @@ def removeTask(self, taskName):
         self.data.tasks.tasklist.remove(taskName)
         return
 
+    def getSiteWhitelist(self):
+        """
+        Return the value stored under the `SiteWhiteList` attribute of the workload data
+        :return: the stored site white list, or an empty list when the
+                 attribute is missing or evaluates to False
+        """
+        storedList = getattr(self.data, 'SiteWhiteList', None)
+        return storedList if storedList else []
+
     def setSiteWhitelist(self, siteWhitelist):
         """
         _setSiteWhitelist_
@@ -689,6 +698,15 @@ def setSiteWhitelist(self, siteWhitelist):
 
         return
 
+    def getSiteBlacklist(self):
+        """
+        Return the value stored under the `SiteBlackList` attribute of the workload data
+        :return: the stored site black list, or an empty list when the
+                 attribute is missing or evaluates to False
+        """
+        storedList = getattr(self.data, 'SiteBlackList', None)
+        return storedList if storedList else []
+
     def setSiteBlacklist(self, siteBlacklist):
         """
         _setSiteBlacklist_

diff --git a/test/python/WMCore_t/WMSpec_t/WMWorkload_t.py b/test/python/WMCore_t/WMSpec_t/WMWorkload_t.py
@@ -336,6 +336,37 @@ def testDbsUrl(self):
         self.assertEqual(url, "https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader")
         return
 
+    def testGetSiteWhitelist(self):
+        """
+        Tests getSiteWhitelist and getSiteBlacklist functionality of the workload.
+        """
+        testWorkload = WMWorkloadHelper(WMWorkload("TestWorkload"))
+
+        procTestTask = testWorkload.newTask("ProcessingTask")
+        procTestTaskCMSSW = procTestTask.makeStep("cmsRun1")
+        procTestTaskCMSSW.setStepType("CMSSW")
+
+        procTestTask.addInputDataset(name="/PrimaryDataset/ProcessedDataset/DATATIER",
+                                     primary="PrimaryDataset",
+                                     processed="ProcessedDataset",
+                                     tier="DATATIER",
+                                     block_whitelist=["Block1", "Block2"],
+                                     block_blacklist=["Block3"],
+                                     run_whitelist=[1, 2],
+                                     run_blacklist=[3])
+
+        newSiteWhiteList = ["T1_US_FNAL", "T0_CH_CERN"]
+        newSiteBlackList = ["T1_DE_KIT"]
+        testWorkload.setSiteWhitelist(newSiteWhiteList)
+        testWorkload.setSiteBlacklist(newSiteBlackList)
+
+        # read the lists back through the workload helper, which is where the
+        # getters exercised by this test are defined, and use the matching
+        # getter for each list
+        siteWhiteList = testWorkload.getSiteWhitelist()
+        siteBlackList = testWorkload.getSiteBlacklist()
+        # compare as sets: the order of the sites is not relevant
+        self.assertEqual(set(newSiteWhiteList), set(siteWhiteList),
+                         "Error: Site white list mismatch")
+        self.assertEqual(set(newSiteBlackList), set(siteBlackList),
+                         "Error: Site black list mismatch")
+
     def testWhiteBlacklists(self):
         """
         _testWhiteBlacklists_