# @package      hubzero-submit-common
# @file         RemoteJobMonitor.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2012-2013 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2012-2013 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#

import sys
import time
import traceback
import csv
import os
from errno import EPIPE

from hubzero.submit.LogMessage  import logID as log
from hubzero.submit.MessageCore import MessageCore
from hubzero.submit.JobOutput   import *

class RemoteJobMonitor(MessageCore):
   def __init__(self,
                host,
                port,
                repeatDelay=5,
                fixedBufferSize=128):
      """
      Initialize the MessageCore base and build the status lookup tables.

      host, port and repeatDelay are forwarded to MessageCore.__init__ as
      listenerHost, listenerPort and repeatDelay.  fixedBufferSize is
      stored and handed to every request method this class calls.
      """
      MessageCore.__init__(self,listenerHost=host,listenerPort=port,repeatDelay=repeatDelay)
      self.fixedBufferSize = fixedBufferSize
      self.enteredCommand  = None
      self.startDate       = None
      self.finishDate      = None

      # (status code, state, message) for every status code known here
      statusTable = (
         ('WF','waiting',  'Pending Submission'),
         ('RM','waiting',  'Removed'),
         ('DF','waiting',  'Deferred'),
         ('HS','waiting',  'User & System Hold'),
         ('RP','waiting',  'Remove Pending'),
         ('TX','waiting',  'Terminated'),
         ('NF','waiting',  'Node_Fail'),
         ('TO','waiting',  'Timeout'),
         ('PD','waiting',  'Pending'),
         ('NQ','waiting',  'Not Queued'),
         ('NR','waiting',  'Not Run'),
         ('RJ','waiting',  'Rejected'),
         ('CK','executing','Checkpointing'),
         ('C', 'executing','Complete'),
         ('E', 'executing','Exiting'),
         ('D', 'finished', 'Done'),
         ('F', 'waiting',  'Failed'),
         ('I', 'waiting',  'Idle'),
         ('H', 'waiting',  'Held'),
         ('CA','waiting',  'Cancelled'),
         ('CG','executing','Completing'),
         ('CF','waiting',  'Configuring'),
         ('N', 'waiting',  'Submitted'),
         ('Q', 'waiting',  'Queued'),
         ('P', 'waiting',  'Pending'),
         ('S', 'waiting',  'Suspended'),
         ('R', 'executing','Running'),
         ('T', 'waiting',  'Moving'),
         ('W', 'waiting',  'Waiting'),
         ('V', 'waiting',  'Vacated'),
         ('X', 'aborted',  'Marked For Deletion'),
         ('XP','waiting',  'Reject Pending'),
         ('CP','executing','Complete Pending'),
         ('EP','waiting',  'Preempt Pending'),
         ('VP','waiting',  'Vacate Pending'),
         ('PT','waiting',  'Preempted'),
         ('ST','waiting',  'Starting'),
         ('CD','executing','Completed'),
         ('SH','waiting',  'System Hold'),
         ('MP','waiting',  'Resume Pending'),
         ('SE','failed',   'Submission Error'),
         ('EF','failed',   'Execution Failed'),
         ('d', 'aborted',  'Marked For Deletion'),
         ('e', 'aborted',  'Submission Error'),
         ('r', 'executing','Running'),
         ('t', 'waiting',  'Transferring'),
         ('w', 'waiting',  'Waiting'),
      )
      self.jobStatus = {}
      for statusCode,state,message in statusTable:
         self.jobStatus[statusCode] = {'state': state, 'message': message}

      # lower numbers are reported first
      self.jobStatusReportOrders = {'waiting': 5,
                                    'aborted': 1,
                                    'setting up': 6,
                                    'failed': 3,
                                    'executing': 4,
                                    'finished': 2}


   @staticmethod
   def __writeToStdout(message):
      try:
         sys.stdout.write(message)
         sys.stdout.flush()
      except IOError,err:
         if not err[0] in [EPIPE]:
            log("Can't write to stdout: %s" % (message))


   def postJobSubmission(self,
                         siteMonitorDesignator,
                         remoteJobId,
                         hubUserId,
                         enteredCommand,
                         localJobId,
                         instanceId,
                         destination,
                         runName,
                         nCores,
                         distributorPid):
      """
      Send an 'S:' (submission) message for a single job instance.

      enteredCommand and the current local time are remembered on the
      instance as enteredCommand and startDate.  The remaining arguments
      are converted with str() and sent space separated; the response is
      logged together with the number of tries.
      """
      self.enteredCommand = enteredCommand
      self.startDate      = time.strftime("%a %b %e %X %Z %Y")

      messageFields = ('S:' + siteMonitorDesignator,
                       remoteJobId,
                       hubUserId,
                       localJobId,
                       instanceId,
                       destination,
                       runName,
                       nCores,
                       distributorPid)
      queryMessage = ' '.join(map(str,messageFields))

      bufferSize = self.fixedBufferSize
      nTry,response = self.requestMessageResponse(queryMessage,bufferSize,bufferSize)

      confirmation = response.strip()
      log("confirmation: S(%d):%s" % (nTry,confirmation))


   def postPegasusWorkflowSubmission(self,
                                     siteMonitorDesignator,
                                     remoteJobId,
                                     hubUserId,
                                     enteredCommand,
                                     localJobId,
                                     nInstances,
                                     destination,
                                     runName,
                                     nCores,
                                     distributorPid):
      """
      Send an 'S:' (submission) message for a Pegasus workflow.

      The message has the same layout as the one built by
      postJobSubmission, except that the local job id is sent as
      'WF;<localJobId>' and the instance id slot carries nInstances.
      enteredCommand and the current local time are remembered on the
      instance; the response is logged together with the number of tries.
      """
      self.enteredCommand = enteredCommand
      self.startDate      = time.strftime("%a %b %e %X %Z %Y")

      # localJobId must already be a string, as with str.join()
      workflowId = 'WF' + ';' + localJobId
      messageFields = ('S:' + siteMonitorDesignator,
                       remoteJobId,
                       hubUserId,
                       workflowId,
                       nInstances,
                       destination,
                       runName,
                       nCores,
                       distributorPid)
      queryMessage = ' '.join(map(str,messageFields))

      bufferSize = self.fixedBufferSize
      nTry,response = self.requestMessageResponse(queryMessage,bufferSize,bufferSize)

      confirmation = response.strip()
      log("confirmation: S(%d):%s" % (nTry,confirmation))


   def queryRemoteJobStatus(self,
                            siteMonitorDesignator,
                            remoteJobId):
      """
      Send a 'Q:' (query) message for one remote job and parse the answer.

      The stripped response must split into exactly two whitespace
      separated fields: status and stage.  A stage of the form
      '<stage>;<site>' is split into stage and site, otherwise the site is
      '?'.  A stage of '?' is replaced by "Job".

      Returns the tuple (jobStatus,jobStage,jobSite).
      """
      queryMessage = ' '.join(('Q:' + siteMonitorDesignator,remoteJobId))
      bufferSize = self.fixedBufferSize
      nTry,response = self.requestMessageResponse(queryMessage,bufferSize,bufferSize)

      reply = response.strip()
      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: Q(%d):%s" % (nTry,reply))

      jobStatus,jobStage = reply.split()
      jobSite = '?'
      if ';' in jobStage:
         jobStage,jobSite = jobStage.split(';')
      if jobStage == '?':
         jobStage = "Job"

      return(jobStatus,jobStage,jobSite)


   def queryPegasusWorkflowStatus(self,
                                  siteMonitorDesignator,
                                  remoteJobId,
                                  nInstances):
      """
      Send a 'W:' (workflow query) message and parse the answer.

      The response is a ':' separated list of records.  The first record
      holds the DAG status and stage (optionally '<stage>;<site>'); a
      stage of '?' is replaced by "DAG" and a missing site by '?'.  Every
      following record holds 'instance status stage' for one workflow
      instance.

      Returns (dagStatus,dagStage,dagSite,wfInstances) where wfInstances
      maps the integer instance number to a dictionary with the keys
      'jobStatus' and 'jobStage'.
      """
      queryMessage = ' '.join(('W:' + siteMonitorDesignator,remoteJobId,str(nInstances)))
      bufferSize = self.fixedBufferSize
      nTry,response = self.requestMessageVariableResponse(queryMessage,bufferSize,bufferSize)

      records = response.split(':')
      dagRecord = records[0].strip()
      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: W(%d):%s" % (nTry,dagRecord))

      dagStatus,dagStage = dagRecord.split()
      dagSite = '?'
      if ';' in dagStage:
         dagStage,dagSite = dagStage.split(';')
      if dagStage == '?':
         dagStage = "DAG"

      wfInstances = {}
      for instanceRecord in records[1:]:
         instance,jobStatus,jobStage = instanceRecord.strip().split()
         wfInstances[int(instance)] = {'jobStatus': jobStatus,
                                       'jobStage': jobStage}

      return(dagStatus,dagStage,dagSite,wfInstances)


   def terminateRemoteJob(self,
                          siteMonitorDesignator,
                          remoteJobId):
      """
      Send a 'T:' (terminate) message for one remote job.

      The response is not interpreted; it is only logged together with the
      number of tries.
      """
      queryMessage = ' '.join(('T:' + siteMonitorDesignator,remoteJobId))
      bufferSize = self.fixedBufferSize
      nTry,response = self.requestMessageResponse(queryMessage,bufferSize,bufferSize)

      confirmation = response.strip()
      log("confirmation: T(%d):%s" % (nTry,confirmation))


   def queryRemoteActiveJobStatus(self,
                                  siteMonitorDesignator,
                                  remoteJobId):
      """
      Send an 'R:' message for one remote job.

      Returns (lastReportTime,report): the timestamp from the response
      converted to float, and the report text exactly as received.
      """
      queryMessage = ' '.join(('R:' + siteMonitorDesignator,remoteJobId))
      bufferSize = self.fixedBufferSize
      nTry,report,lastReportTime = self.requestMessageTimestampResponse(queryMessage,bufferSize,bufferSize)

      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: R(%d):%s" % (nTry,lastReportTime.strip()))

      reportTimestamp = float(lastReportTime)
      return(reportTimestamp,report)


   def queryUsersActivity(self,
                          hubUserId):
      """
      Send an 'A:' (activity) message for hubUserId.

      A non-empty report is a ':' separated list of 'userId activity'
      pairs.

      Returns (reportTime,reportedActivities): the response timestamp as
      float and a dictionary mapping integer user id to float activity.
      The dictionary is empty when the report is empty.
      """
      queryMessage = 'A:' + str(hubUserId)
      bufferSize = self.fixedBufferSize
      nTry,report,reportTime = self.requestMessageTimestampResponse(queryMessage,bufferSize,bufferSize)

      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: A(%d):%s" % (nTry,reportTime.strip()))

      reportedActivities = {}
      if len(report) > 0:
         for activityRecord in report.split(':'):
            userId,activity = activityRecord.split()
            reportedActivities[int(userId)] = float(activity)

      return(float(reportTime),reportedActivities)


   def queryUserActiveJobStatus(self,
                                hubUserId):
      """
      Send a 'U:' message for hubUserId and parse the active job report.

      A non-empty report is a ':' separated list of records, each with the
      seven whitespace separated fields
      localJobId instanceId runName jobQueue site jobStatus jobStage.
      A stage of the form '<stage>;<site>' overrides the site field and a
      stage of '?' is replaced by 'Job'.  The status code is translated to
      its message text.

      Returns (reportTime,reportedJobs) where reportedJobs[localJobId]
      [instanceId] is the tuple
      (runName,jobQueue,jobSite,jobStatusMessage,jobStage).
      """
      queryMessage = 'U:' + str(hubUserId)
      bufferSize = self.fixedBufferSize
      nTry,report,reportTime = self.requestMessageTimestampResponse(queryMessage,bufferSize,bufferSize)

      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: U(%d):%s" % (nTry,reportTime.strip()))

      reportedJobs = {}
      if len(report) > 0:
         for jobRecord in report.split(':'):
            localJobId,instanceId,runName,jobQueue,site,jobStatus,stageField = jobRecord.split()
            if ';' in stageField:
               jobStage,jobSite = stageField.split(';')
            else:
               jobStage,jobSite = stageField,site
            jobStatusMessage = self.__getJobStatusMessage(jobStatus)
            if jobStage == '?':
               jobStage = 'Job'
            jobInstances = reportedJobs.setdefault(localJobId,{})
            jobInstances[instanceId] = (runName,jobQueue,jobSite,jobStatusMessage,jobStage)

      return(float(reportTime),reportedJobs)


   def queryUserActiveJobPid(self,
                             hubUserId,
                             localJobId):
      """
      Send a 'P:' message for one job of hubUserId.

      Returns the report converted to int, or None when the report is
      empty.
      """
      queryMessage = 'P:' + ' '.join(map(str,(hubUserId,localJobId)))
      bufferSize = self.fixedBufferSize
      nTry,report,reportTime = self.requestMessageTimestampResponse(queryMessage,bufferSize,bufferSize)

      if nTry > 1:
         # only report when the request had to be repeated
         log("confirmation: P(%d):%s" % (nTry,reportTime.strip()))

      if len(report) > 0:
         activeJobPid = int(report)
      else:
         activeJobPid = None

      return(activeJobPid)


   def __getJobStatusMessage(self,
                             jobStatus):
      try:
         jobStatusMessage = self.jobStatus[jobStatus]['message']
      except:
         log(traceback.format_exc())
         jobStatusMessage = 'Unknown Status(%s)' % (jobStatus)

      return(jobStatusMessage)


   def __getJobStatusState(self,
                           jobStatus):
# 'waiting'
# 'aborted'
# 'setting up'
# 'failed'
# 'executing'
# 'finished'

      try:
         jobStatusState = self.jobStatus[jobStatus]['state']
      except:
         log(traceback.format_exc())
         jobStatusState = 'Unknown State(%s)' % (jobStatus)

      return(jobStatusState)


   def __getJobStatusReportOrder(self,
                                 jobStatusState):
      try:
         jobStatusReportOrder = self.jobStatusReportOrders[jobStatusState]
      except:
         jobStatusReportOrder = 99

      return(jobStatusReportOrder)


   def waitForBatchJobs(self,
                        waitForJobsInfo,
                        abortGlobal):
      """
      Poll job status until at least one job in waitForJobsInfo is done.

      waitForJobsInfo is indexed by instance.  Each entry is read for
      'recentJobStatus', 'isBatchJob', 'siteMonitorDesignator',
      'remoteJobId' and 'knownSite', and is updated in place with
      'recentJobStatus' and 'recentJobSite'.  Entries whose
      'recentJobStatus' is already 'D' are skipped.  Entries that are not
      batch jobs are marked 'D' immediately, without a status query.

      abortGlobal['abortAttempted'] is tested before every polling pass;
      a true value ends the wait.

      Status changes are logged and written to stdout.

      Returns the list of instance keys that reached status 'D' during
      this call.
      """
      completeRemoteJobIndexes = []

      # Sleep times are in seconds.  The sleep starts at minimumDelay and is
      # doubled after every updateFrequency consecutive sleeps, up to
      # maximumDelay.  An unchanged status is written again once
      # maximumReportDelay seconds of sleep have passed since the last write.
      minimumDelay = 5       #  5 10 20 40 80 160 320
      maximumDelay = 320
      updateFrequency = 5
      maximumReportDelay = 320

      # delayTime is the sum of the requested sleep times, not wall clock time
      delayTime = 0
      sleepTime = minimumDelay
      nDelays = 0
      timeLastReported = delayTime

      # last seen status, stage and site, keyed by instance
      previousJobStatuses = {}
      previousJobStages   = {}
      previousJobSites    = {}

      # Initial pass: query and report every job that is not yet done.
      incompleteJobs = 0
      for instance in waitForJobsInfo:
         if waitForJobsInfo[instance]['recentJobStatus'] != 'D':
            if waitForJobsInfo[instance]['isBatchJob']:
               siteMonitorDesignator = waitForJobsInfo[instance]['siteMonitorDesignator']
               remoteJobId           = waitForJobsInfo[instance]['remoteJobId']
               knownSite             = waitForJobsInfo[instance]['knownSite']
               currentJobStatus,currentJobStage,currentJobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)
               # fall back to the known site when the query did not name one
               if currentJobSite == "" or currentJobSite == '?':
                  if knownSite != "":
                     currentJobSite = knownSite
               if currentJobSite != "" and currentJobSite != '?':
                  waitForJobsInfo[instance]['recentJobSite'] = currentJobSite
               jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
               if currentJobSite == "" or currentJobSite == '?':
                  log("status:%s %s" % (currentJobStage,currentJobStatus))
                  self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                        time.ctime()))
               else:
                  log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
                  self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                              currentJobSite,time.ctime()))
            else:
               # not a batch job: treated as done without a status query
               currentJobStatus = 'D'
               currentJobStage  = 'Job'
               currentJobSite   = ''

            waitForJobsInfo[instance]['recentJobStatus'] = currentJobStatus
            if currentJobStatus == 'D':
               completeRemoteJobIndexes.append(instance)
            else:
               incompleteJobs += 1

            previousJobStatuses[instance] = currentJobStatus
            previousJobStages[instance]   = currentJobStage
            previousJobSites[instance]    = currentJobSite

      # Poll until a job completes, nothing is left to wait for, or an abort
      # has been flagged.
      while (len(completeRemoteJobIndexes) == 0) and (incompleteJobs > 0) and not abortGlobal['abortAttempted']:
         nDelays += 1
         time.sleep(sleepTime)
         delayTime += sleepTime
         if nDelays == updateFrequency:
            nDelays = 0
            sleepTime *= 2
            if sleepTime > maximumDelay:
               sleepTime = maximumDelay

         for instance in waitForJobsInfo:
            if waitForJobsInfo[instance]['recentJobStatus'] != 'D':
               if waitForJobsInfo[instance]['isBatchJob']:
                  siteMonitorDesignator = waitForJobsInfo[instance]['siteMonitorDesignator']
                  remoteJobId           = waitForJobsInfo[instance]['remoteJobId']
                  knownSite             = waitForJobsInfo[instance]['knownSite']
                  previousJobStatus = previousJobStatuses[instance]
                  previousJobStage  = previousJobStages[instance]
                  previousJobSite   = previousJobSites[instance]
                  currentJobStatus,currentJobStage,currentJobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)
                  # fall back to the known site when the query did not name one
                  if currentJobSite == "" or currentJobSite == '?':
                     if knownSite != "":
                        currentJobSite = knownSite
                  if currentJobSite != "" and currentJobSite != '?':
                     waitForJobsInfo[instance]['recentJobSite'] = currentJobSite
                  if currentJobStatus != previousJobStatus or \
                     currentJobStage != previousJobStage or \
                     currentJobSite != previousJobSite:
                     # something changed: report it and restart the back-off
                     jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                     if currentJobSite == "" or currentJobSite == '?':
                        log("status:%s %s" % (currentJobStage,currentJobStatus))
                        self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                              time.ctime()))
                     else:
                        log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
                        self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                    currentJobSite,time.ctime()))
                     waitForJobsInfo[instance]['recentJobStatus'] = currentJobStatus
                     if currentJobStatus == 'D':
                        completeRemoteJobIndexes.append(instance)
                        incompleteJobs -= 1
                     previousJobStatuses[instance] = currentJobStatus
                     previousJobStages[instance]   = currentJobStage
                     previousJobSites[instance]    = currentJobSite
                     timeLastReported = delayTime
                     sleepTime = minimumDelay
                     nDelays = 0
                  else:
                     # Nothing changed: repeat the last status on stdout once
                     # enough sleep time has passed.  timeLastReported is
                     # shared by all instances, so within one pass only the
                     # first unchanged job that reaches this test is repeated.
                     if delayTime >= (timeLastReported + maximumReportDelay):
                        jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                        if currentJobSite == "" or currentJobSite == '?':
                           self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                 time.ctime()))
                        else:
                           self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                       currentJobSite,time.ctime()))
                        timeLastReported = delayTime

      log("waitForBatchJobs: nCompleteRemoteJobIndexes = %d, nIncompleteJobs = %d, abortGlobal = %s" % \
                    (len(completeRemoteJobIndexes),incompleteJobs,abortGlobal['abortAttempted']))

      # drop the local bookkeeping dictionaries before returning
      del previousJobStatuses
      del previousJobStages
      del previousJobSites

      return(completeRemoteJobIndexes)


   def __updateSweepStatus(self,
                           parameterCombinationsPath,
                           waitForJobsInfo):
      """
      Rewrite the parameter combinations CSV file with current job states.

      Nothing is done when parameterCombinationsPath does not exist.
      Otherwise the file is read, leading rows whose first cell starts
      with '#' are discarded, and the next row is kept as the column
      header.  In every following row, column 0 is read as the integer
      instance number and column 1 holds the state name.  For instances
      present in waitForJobsInfo, column 1 is replaced by the state derived
      from that entry's 'recentJobStatus'.

      The result is written to '<base>.tmp' in the same directory, where
      <base> is the file name up to its first '.', and then renamed over
      parameterCombinationsPath.  It starts with '# command:',
      '# started:', optional '# finished:' and '# completed: n/m jobs'
      comment rows, followed by the header and the rows grouped by
      ascending report order of their state.  Rows in the states
      'finished', 'failed' and 'aborted' count as completed.

      Both files are closed even when reading or writing fails; in that
      case the original file is left untouched and the exception is passed
      on.
      """
      parameterCombinationsDir  = os.path.dirname(parameterCombinationsPath)
      parameterCombinationsBase = os.path.basename(parameterCombinationsPath)
      if '.' in parameterCombinationsBase:
         parameterCombinationsBase = parameterCombinationsBase.split('.')[0]
      tmpParameterCombinationsFile = parameterCombinationsBase + '.tmp'
      tmpParameterCombinationsPath = os.path.join(parameterCombinationsDir,tmpParameterCombinationsFile)

      if not os.path.exists(parameterCombinationsPath):
         return

      # binary mode is what the Python 2 csv module expects
      with open(parameterCombinationsPath,'rb') as fpCSVIn:
         csvReader = csv.reader(fpCSVIn)
         with open(tmpParameterCombinationsPath,'wb') as fpCSVOut:
            # skip the comment rows written by a previous update;
            # startswith() also copes with an empty first cell
            parameterNames = next(csvReader)
            while len(parameterNames) > 0 and parameterNames[0].startswith('#'):
               parameterNames = next(csvReader)

            # rows grouped by the report order of their state
            parameterCombinations = {}
            nCompleted = 0
            nInstances = 0
            for parameterCombination in csvReader:
               instance = int(parameterCombination[0])
               if instance in waitForJobsInfo:
                  jobStatusState = self.__getJobStatusState(waitForJobsInfo[instance]['recentJobStatus'])
                  parameterCombination[1] = jobStatusState
               else:
                  # not monitored by this call: keep the recorded state
                  jobStatusState = parameterCombination[1]
               jobStatusReportOrder = self.__getJobStatusReportOrder(jobStatusState)
               parameterCombinations.setdefault(jobStatusReportOrder,[]).append(parameterCombination)
               if jobStatusState in ['finished','failed','aborted']:
                  nCompleted += 1
               nInstances += 1

            csvWriter = csv.writer(fpCSVOut)
            csvWriter.writerow(('# command: ' + self.enteredCommand,))
            csvWriter.writerow(('# started: ' + self.startDate,))
            if self.finishDate:
               csvWriter.writerow(('# finished: ' + self.finishDate,))
            csvWriter.writerow(('# completed: %d/%d jobs' % (nCompleted,nInstances),))
            csvWriter.writerow(parameterNames)
            for jobStatusReportOrder in sorted(parameterCombinations):
               for parameterCombination in parameterCombinations[jobStatusReportOrder]:
                  csvWriter.writerow(parameterCombination)

      # only reached when the temporary file was written completely
      os.rename(tmpParameterCombinationsPath,parameterCombinationsPath)


   def waitForSweepJobs(self,
                        waitForJobsInfo,
                        parameterCombinationsPath,
                        isClientTTY,
                        abortGlobal):
      """
      Poll job status for a parameter sweep until at least one job is done.

      waitForJobsInfo is indexed by instance.  Each entry is read for
      'recentJobStatus', 'isBatchJob', 'siteMonitorDesignator',
      'remoteJobId' and 'knownSite', and is updated in place with
      'recentJobStatus' and 'recentJobSite'.  Entries whose
      'recentJobStatus' is already 'D' are skipped.  Entries that are not
      batch jobs are marked 'D' immediately, without a status query.

      parameterCombinationsPath is handed to __updateSweepStatus after the
      initial pass and after every polling pass.

      Status changes are always logged.  They are written to stdout only
      when isClientTTY is false.

      abortGlobal['abortAttempted'] is tested before every polling pass;
      a true value ends the wait.

      Returns the list of instance keys that reached status 'D' during
      this call.
      """
      completeRemoteJobIndexes = []

      # Sleep times are in seconds.  The sleep starts at minimumDelay and is
      # doubled after every updateFrequency consecutive sleeps, up to
      # maximumDelay.  An unchanged status is written again once
      # maximumReportDelay seconds of sleep have passed since the last write.
      minimumDelay = 5       #  5 10 20 40 80 160 320
      maximumDelay = 320
      updateFrequency = 5
      maximumReportDelay = 320

      # delayTime is the sum of the requested sleep times, not wall clock time
      delayTime = 0
      sleepTime = minimumDelay
      nDelays = 0
      timeLastReported = delayTime

      # last seen status, stage and site, keyed by instance
      previousJobStatuses = {}
      previousJobStages   = {}
      previousJobSites    = {}

      # Initial pass: query and report every job that is not yet done.
      incompleteJobs = 0
      for instance in waitForJobsInfo:
         if waitForJobsInfo[instance]['recentJobStatus'] != 'D':
            if waitForJobsInfo[instance]['isBatchJob']:
               siteMonitorDesignator = waitForJobsInfo[instance]['siteMonitorDesignator']
               remoteJobId           = waitForJobsInfo[instance]['remoteJobId']
               knownSite             = waitForJobsInfo[instance]['knownSite']
               currentJobStatus,currentJobStage,currentJobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)
               # fall back to the known site when the query did not name one
               if currentJobSite == "" or currentJobSite == '?':
                  if knownSite != "":
                     currentJobSite = knownSite
               if currentJobSite != "" and currentJobSite != '?':
                  waitForJobsInfo[instance]['recentJobSite'] = currentJobSite
               jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
               if currentJobSite == "" or currentJobSite == '?':
                  log("status:%s %s" % (currentJobStage,currentJobStatus))
               else:
                  log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
               if not isClientTTY:
                  if currentJobSite == "" or currentJobSite == '?':
                     self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                           time.ctime()))
                  else:
                     self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                 currentJobSite,time.ctime()))
            else:
               # not a batch job: treated as done without a status query
               currentJobStatus = 'D'
               currentJobStage  = 'Job'
               currentJobSite   = ''

            waitForJobsInfo[instance]['recentJobStatus'] = currentJobStatus
            if currentJobStatus == 'D':
               completeRemoteJobIndexes.append(instance)
            else:
               incompleteJobs += 1

            previousJobStatuses[instance] = currentJobStatus
            previousJobStages[instance]   = currentJobStage
            previousJobSites[instance]    = currentJobSite

      self.__updateSweepStatus(parameterCombinationsPath,waitForJobsInfo)

      # Poll until a job completes, nothing is left to wait for, or an abort
      # has been flagged.
      while (len(completeRemoteJobIndexes) == 0) and (incompleteJobs > 0) and not abortGlobal['abortAttempted']:
         nDelays += 1
         time.sleep(sleepTime)
         delayTime += sleepTime
         if nDelays == updateFrequency:
            nDelays = 0
            sleepTime *= 2
            if sleepTime > maximumDelay:
               sleepTime = maximumDelay

         for instance in waitForJobsInfo:
            if waitForJobsInfo[instance]['recentJobStatus'] != 'D':
               if waitForJobsInfo[instance]['isBatchJob']:
                  siteMonitorDesignator = waitForJobsInfo[instance]['siteMonitorDesignator']
                  remoteJobId           = waitForJobsInfo[instance]['remoteJobId']
                  knownSite             = waitForJobsInfo[instance]['knownSite']
                  previousJobStatus = previousJobStatuses[instance]
                  previousJobStage  = previousJobStages[instance]
                  previousJobSite   = previousJobSites[instance]
                  currentJobStatus,currentJobStage,currentJobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)
                  # fall back to the known site when the query did not name one
                  if currentJobSite == "" or currentJobSite == '?':
                     if knownSite != "":
                        currentJobSite = knownSite
                  if currentJobSite != "" and currentJobSite != '?':
                     waitForJobsInfo[instance]['recentJobSite'] = currentJobSite
                  if currentJobStatus != previousJobStatus or \
                     currentJobStage != previousJobStage or \
                     currentJobSite != previousJobSite:
                     # something changed: report it and restart the back-off
                     jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                     if currentJobSite == "" or currentJobSite == '?':
                        log("status:%s %s" % (currentJobStage,currentJobStatus))
                     else:
                        log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
                     if not isClientTTY:
                        if currentJobSite == "" or currentJobSite == '?':
                           self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                 time.ctime()))
                        else:
                           self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                       currentJobSite,time.ctime()))
                     waitForJobsInfo[instance]['recentJobStatus'] = currentJobStatus
                     if currentJobStatus == 'D':
                        completeRemoteJobIndexes.append(instance)
                        incompleteJobs -= 1
                     previousJobStatuses[instance] = currentJobStatus
                     previousJobStages[instance]   = currentJobStage
                     previousJobSites[instance]    = currentJobSite
                     timeLastReported = delayTime
                     sleepTime = minimumDelay
                     nDelays = 0
                  else:
                     # Nothing changed: repeat the last status on stdout once
                     # enough sleep time has passed.  timeLastReported is
                     # shared by all instances, so within one pass only the
                     # first unchanged job that reaches this test is repeated.
                     if delayTime >= (timeLastReported + maximumReportDelay):
                        jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                        if not isClientTTY:
                           if currentJobSite == "" or currentJobSite == '?':
                              self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                    time.ctime()))
                           else:
                              self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                          currentJobSite,
                                                                                                          time.ctime()))
                        timeLastReported = delayTime

         # Record the finish time the first time no incomplete job is left.
         # NOTE(review): this is only reached inside the polling loop, so
         # finishDate stays unset when every job is already done after the
         # initial pass - confirm that this is intended.
         if incompleteJobs == 0:
            if not self.finishDate:
               self.finishDate = time.strftime("%a %b %e %X %Z %Y")

         self.__updateSweepStatus(parameterCombinationsPath,waitForJobsInfo)

      log("waitForSweepJobs: nCompleteRemoteJobIndexes = %d, nIncompleteJobs = %d, abortGlobal = %s" % \
                    (len(completeRemoteJobIndexes),incompleteJobs,abortGlobal['abortAttempted']))

      # drop the local bookkeeping dictionaries before returning
      del previousJobStatuses
      del previousJobStages
      del previousJobSites

      return(completeRemoteJobIndexes)


   def __updatePegasusWorkflowStatus(self,
                                     parameterCombinationsPath,
                                     wfInstances):
      """Rewrite the parameter combinations CSV with current workflow job states.

      Rows whose instance number (first column) appears in wfInstances get
      their second column replaced by the state derived from that instance's
      'jobStatus'; all other rows keep the state already recorded in the file.
      Rows are written back grouped by ascending report order, preceded by
      '#' summary rows (command, start date, optional finish date, completed
      count) and the original column header row.  The new content is written
      to a sibling '<base>.tmp' file which is then renamed over
      parameterCombinationsPath.

      Nothing is changed when parameterCombinationsPath does not exist or
      when it contains no header row.

      parameterCombinationsPath -- path of the CSV file to update
      wfInstances               -- mapping of instance number to a dict that
                                   holds at least a 'jobStatus' entry
      """
      if not os.path.exists(parameterCombinationsPath):
         return

      parameterCombinationsDir  = os.path.dirname(parameterCombinationsPath)
      parameterCombinationsBase = os.path.basename(parameterCombinationsPath)
      if '.' in parameterCombinationsBase:
         parameterCombinationsBase = parameterCombinationsBase.split('.')[0]
      tmpParameterCombinationsFile = parameterCombinationsBase + '.tmp'
      tmpParameterCombinationsPath = os.path.join(parameterCombinationsDir,tmpParameterCombinationsFile)

      parameterNames        = None
      parameterCombinations = {}
      nCompleted = 0
      nInstances = 0

      # Read the whole file first so the input handle is always released,
      # even when a row cannot be parsed.
      fpCSVIn = open(parameterCombinationsPath,'rb')
      try:
         csvReader = csv.reader(fpCSVIn)
         # Skip the leading '#' summary rows; the first other row is the
         # column header.  startswith() tolerates an empty first cell.
         for row in csvReader:
            if len(row) > 0 and row[0].startswith('#'):
               continue
            parameterNames = row
            break

         if parameterNames is not None:
            for parameterCombination in csvReader:
               instance = int(parameterCombination[0])
               if instance in wfInstances:
                  jobStatusState       = self.__getJobStatusState(wfInstances[instance]['jobStatus'])
                  jobStatusReportOrder = self.__getJobStatusReportOrder(jobStatusState)
                  parameterCombination[1] = jobStatusState
               else:
                  jobStatusState       = parameterCombination[1]
                  jobStatusReportOrder = self.__getJobStatusReportOrder(jobStatusState)
               parameterCombinations.setdefault(jobStatusReportOrder,[]).append(parameterCombination)
               if jobStatusState in ('finished','failed','aborted'):
                  nCompleted += 1
               nInstances += 1
      finally:
         fpCSVIn.close()

      if parameterNames is None:
         # Empty or comment-only file: there is nothing to regroup, so the
         # existing file is left untouched instead of failing mid-update.
         log("__updatePegasusWorkflowStatus: no header row found in %s" % (parameterCombinationsPath))
         return

      fpCSVOut = open(tmpParameterCombinationsPath,'wb')
      try:
         csvWriter = csv.writer(fpCSVOut)
         csvWriter.writerow(('# command: ' + self.enteredCommand,))
         csvWriter.writerow(('# started: ' + self.startDate,))
         if self.finishDate:
            csvWriter.writerow(('# finished: ' + self.finishDate,))
         csvWriter.writerow(('# completed: %d/%d jobs' % (nCompleted,nInstances),))
         csvWriter.writerow(parameterNames)
         for jobStatusReportOrder in sorted(parameterCombinations):
            for parameterCombination in parameterCombinations[jobStatusReportOrder]:
               csvWriter.writerow(parameterCombination)
      finally:
         fpCSVOut.close()

      # Only reached when the temporary file was written completely.
      os.rename(tmpParameterCombinationsPath,parameterCombinationsPath)


   def waitForPegasusWorkflowJobs(self,
                                  waitForJobsInfo,
                                  nInstances,
                                  parameterCombinationsPath,
                                  isClientTTY,
                                  abortGlobal):
      """Poll a Pegasus workflow until it is done or an abort is attempted.

      The workflow itself is tracked through the waitForJobsInfo entry keyed
      nInstances+1.  Its status is queried once up front and then repeatedly
      inside a sleep loop.  The sleep interval starts at 5 seconds, doubles
      after every 5 polls up to 320 seconds, and drops back to 5 seconds
      whenever the workflow status, stage or site changes.

      Along the way the method
        - copies per-instance status and site reported by the query into the
          matching waitForJobsInfo entries,
        - rewrites the parameter combinations CSV file through
          __updatePegasusWorkflowStatus,
        - sets self.finishDate the first time status 'D' is seen,
        - logs status changes and, unless isClientTTY is true, writes a
          status line to stdout on change and after 320 seconds of
          accumulated sleep without a change.

      waitForJobsInfo           -- dict keyed by instance number; entries are
                                   dicts read/written via 'recentJobStatus',
                                   'recentJobSite', 'isBatchJob',
                                   'siteMonitorDesignator', 'remoteJobId',
                                   'knownSite', 'instanceDirectory' and
                                   'scratchDirectory'
      nInstances                -- number of workflow instances; also passed
                                   to queryPegasusWorkflowStatus
      parameterCombinationsPath -- CSV file updated with instance states
      isClientTTY               -- when true, no status lines go to stdout
      abortGlobal               -- dict; a true 'abortAttempted' entry ends
                                   the polling loop

      Returns a list holding nInstances+1 once the workflow reports status
      'D'.  The list is empty when the loop ended because of an abort, or
      when the entry was already marked 'D' on entry.
      """
      completeRemoteJobIndexes = []

      # Polling back-off parameters, all in seconds except updateFrequency,
      # which is the number of polls between doublings of the sleep time.
      minimumDelay = 5       #  5 10 20 40 80 160 320
      maximumDelay = 320
      updateFrequency = 5
      maximumReportDelay = 320

      # delayTime accumulates the total time slept; timeLastReported is the
      # delayTime value at which a status line was last produced.
      delayTime = 0
      sleepTime = minimumDelay
      nDelays = 0
      timeLastReported = delayTime

      # Last observed status/stage/site, used to detect changes between polls.
      previousJobStatuses = {}
      previousJobStages   = {}
      previousJobSites    = {}

      # The workflow as a whole is tracked one slot past the last instance.
      executeInstance = nInstances+1
      incompleteJobs = 0
      # Initial status check, performed once before any sleeping.
      if waitForJobsInfo[executeInstance]['recentJobStatus'] != 'D':
         if waitForJobsInfo[executeInstance]['isBatchJob']:
            siteMonitorDesignator = waitForJobsInfo[executeInstance]['siteMonitorDesignator']
            remoteJobId           = waitForJobsInfo[executeInstance]['remoteJobId']
            knownSite             = waitForJobsInfo[executeInstance]['knownSite']
            currentJobStatus,currentJobStage,currentJobSite,wfInstances = self.queryPegasusWorkflowStatus(siteMonitorDesignator,
                                                                                                          remoteJobId,nInstances)
            # Exit codes gathered from the job directories override the
            # queried status for instances marked 'EF'
            # (presumably an execution failure code - confirm in JobOutput).
            wfExitCodes = getInProgressPegasusJobExitCodes(waitForJobsInfo[executeInstance]['instanceDirectory'],
                                                           waitForJobsInfo[executeInstance]['scratchDirectory'])
            for instance in wfExitCodes:
               if wfExitCodes[instance] == 'EF':
                  if instance in wfInstances:
                     wfInstances[instance]['jobStatus'] = wfExitCodes[instance]
            del wfExitCodes
            # Record the finish time only the first time 'D' is observed.
            if currentJobStatus == 'D':
               if not self.finishDate:
                  self.finishDate = time.strftime("%a %b %e %X %Z %Y")
            # Fall back to the configured site when the query reports none.
            if currentJobSite == "" or currentJobSite == '?':
               if knownSite != "":
                  currentJobSite = knownSite
            if currentJobSite != "" and currentJobSite != '?':
               waitForJobsInfo[executeInstance]['recentJobSite'] = currentJobSite
            jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
            if currentJobSite == "" or currentJobSite == '?':
               log("status:%s %s" % (currentJobStage,currentJobStatus))
            else:
               log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
            if not isClientTTY:
               if currentJobSite == "" or currentJobSite == '?':
                  self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                        time.ctime()))
               else:
                  self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                              currentJobSite,time.ctime()))
         else:
            # A non-batch entry is never queried; it is marked done at once.
            currentJobStatus = 'D'
            currentJobStage  = 'Job'
            currentJobSite   = ''
            wfInstances      = {}
            if not self.finishDate:
               self.finishDate = time.strftime("%a %b %e %X %Z %Y")

         # Propagate per-instance status and site into waitForJobsInfo.  The
         # site is the second ';'-separated field of the instance's jobStage.
         for wfInstance in wfInstances:
            if wfInstances[wfInstance]['jobStatus'] != waitForJobsInfo[wfInstance]['recentJobStatus']:
               waitForJobsInfo[wfInstance]['recentJobStatus'] = wfInstances[wfInstance]['jobStatus']
            if ';' in wfInstances[wfInstance]['jobStage']:
               wfCurrentJobSite = wfInstances[wfInstance]['jobStage'].split(';')[1]
               if wfCurrentJobSite != "" and wfCurrentJobSite != '?':
                  waitForJobsInfo[wfInstance]['recentJobSite'] = wfCurrentJobSite
         # The CSV file is always refreshed after the initial check.
         self.__updatePegasusWorkflowStatus(parameterCombinationsPath,wfInstances)
         del wfInstances

         waitForJobsInfo[executeInstance]['recentJobStatus'] = currentJobStatus
         if currentJobStatus == 'D':
            completeRemoteJobIndexes.append(executeInstance)
         else:
            incompleteJobs += 1

         previousJobStatuses[executeInstance] = currentJobStatus
         previousJobStages[executeInstance]   = currentJobStage
         previousJobSites[executeInstance]    = currentJobSite

      # Poll until the workflow completes or an abort has been attempted.
      while (len(completeRemoteJobIndexes) == 0) and (incompleteJobs > 0) and not abortGlobal['abortAttempted']:
         nDelays += 1
         time.sleep(sleepTime)
         delayTime += sleepTime
         # Back off: double the sleep time every updateFrequency polls.
         if nDelays == updateFrequency:
            nDelays = 0
            sleepTime *= 2
            if sleepTime > maximumDelay:
               sleepTime = maximumDelay

         if waitForJobsInfo[executeInstance]['recentJobStatus'] != 'D':
            if waitForJobsInfo[executeInstance]['isBatchJob']:
               siteMonitorDesignator = waitForJobsInfo[executeInstance]['siteMonitorDesignator']
               remoteJobId           = waitForJobsInfo[executeInstance]['remoteJobId']
               knownSite             = waitForJobsInfo[executeInstance]['knownSite']
               previousJobStatus = previousJobStatuses[executeInstance]
               previousJobStage  = previousJobStages[executeInstance]
               previousJobSite   = previousJobSites[executeInstance]
               currentJobStatus,currentJobStage,currentJobSite,wfInstances = self.queryPegasusWorkflowStatus(siteMonitorDesignator,
                                                                                                             remoteJobId,nInstances)
               # Same 'EF' override as in the initial check.
               wfExitCodes = getInProgressPegasusJobExitCodes(waitForJobsInfo[executeInstance]['instanceDirectory'],
                                                              waitForJobsInfo[executeInstance]['scratchDirectory'])
               for instance in wfExitCodes:
                  if wfExitCodes[instance] == 'EF':
                     if instance in wfInstances:
                        wfInstances[instance]['jobStatus'] = wfExitCodes[instance]
               del wfExitCodes
               # instanceChangedStatus gates the CSV rewrite below so the
               # file is only touched when something actually changed.
               instanceChangedStatus = False
               if currentJobStatus == 'D':
                  if not self.finishDate:
                     self.finishDate = time.strftime("%a %b %e %X %Z %Y")
                     instanceChangedStatus = True
               for wfInstance in wfInstances:
                  if wfInstances[wfInstance]['jobStatus'] != waitForJobsInfo[wfInstance]['recentJobStatus']:
                     waitForJobsInfo[wfInstance]['recentJobStatus'] = wfInstances[wfInstance]['jobStatus']
                     instanceChangedStatus = True
                  if ';' in wfInstances[wfInstance]['jobStage']:
                     wfCurrentJobSite = wfInstances[wfInstance]['jobStage'].split(';')[1]
                     if wfCurrentJobSite != "" and wfCurrentJobSite != '?':
                        waitForJobsInfo[wfInstance]['recentJobSite'] = wfCurrentJobSite
               if instanceChangedStatus:
                  self.__updatePegasusWorkflowStatus(parameterCombinationsPath,wfInstances)
                  # NOTE(review): presumably collects stdout/stderr files of
                  # in-progress jobs - confirm against JobOutput.
                  getInProgressPegasusJobStdFiles(waitForJobsInfo[executeInstance]['instanceDirectory'],
                                                  waitForJobsInfo[executeInstance]['scratchDirectory'])
               del wfInstances

               # Fall back to the configured site when the query reports none.
               if currentJobSite == "" or currentJobSite == '?':
                  if knownSite != "":
                     currentJobSite = knownSite
               if currentJobSite != "" and currentJobSite != '?':
                  waitForJobsInfo[executeInstance]['recentJobSite'] = currentJobSite
               # Report immediately when anything about the workflow changed
               # and restart the back-off from the minimum delay.
               if currentJobStatus != previousJobStatus or \
                  currentJobStage != previousJobStage or \
                  currentJobSite != previousJobSite:
                  jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                  if currentJobSite == "" or currentJobSite == '?':
                     log("status:%s %s" % (currentJobStage,currentJobStatus))
                  else:
                     log("status:%s %s %s" % (currentJobStage,currentJobStatus,currentJobSite))
                  if not isClientTTY:
                     if currentJobSite == "" or currentJobSite == '?':
                        self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                              time.ctime()))
                     else:
                        self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                    currentJobSite,time.ctime()))
                  waitForJobsInfo[executeInstance]['recentJobStatus'] = currentJobStatus
                  if currentJobStatus == 'D':
                     completeRemoteJobIndexes.append(executeInstance)
                     incompleteJobs -= 1
                  previousJobStatuses[executeInstance] = currentJobStatus
                  previousJobStages[executeInstance]   = currentJobStage
                  previousJobSites[executeInstance]    = currentJobSite
                  timeLastReported = delayTime
                  sleepTime = minimumDelay
                  nDelays = 0
               else:
                  # Nothing changed: repeat the last status on stdout once
                  # maximumReportDelay seconds of sleep have accumulated.
                  if delayTime >= (timeLastReported + maximumReportDelay):
                     jobStatusMessage = self.__getJobStatusMessage(currentJobStatus)
                     if not isClientTTY:
                        if currentJobSite == "" or currentJobSite == '?':
                           self.__writeToStdout("(%s) %s %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                 time.ctime()))
                        else:
                           self.__writeToStdout("(%s) %s %s at %s %s\n" % (remoteJobId,currentJobStage,jobStatusMessage, \
                                                                                                       currentJobSite,time.ctime()))
                     timeLastReported = delayTime

      log("waitForPegasusWorkflowJobs: nCompleteRemoteJobIndexes = %d, nIncompleteJobs = %d, abortGlobal = %s" % \
                                        (len(completeRemoteJobIndexes),incompleteJobs,abortGlobal['abortAttempted']))

      del previousJobStatuses
      del previousJobStages
      del previousJobSites

      return(completeRemoteJobIndexes)


   def waitForKilledBatchJobs(self,
                              waitForJobsInfo):
      """Block until every killed batch job has been reported as done.

      Each waitForJobsInfo entry whose 'recentJobStatus' is 'K' is examined.
      Batch jobs are queried through queryRemoteJobStatus; non-batch jobs are
      considered done immediately.  An entry that reaches status 'D' has its
      'recentJobStatus' set to 'KD'.  Remaining jobs are polled with a sleep
      interval that starts at 5 seconds, doubles after every 5 polls up to
      30 seconds, and returns to 5 seconds whenever a job's status, stage or
      site changes.  Status lines are logged and written to stdout on each
      change, and written to stdout again after 30 seconds of accumulated
      sleep without a change.

      waitForJobsInfo -- dict keyed by instance; entries are dicts accessed
                         via 'recentJobStatus', 'recentJobSite',
                         'isBatchJob', 'siteMonitorDesignator', 'remoteJobId'
                         and 'knownSite'

      Returns nothing; waitForJobsInfo is updated in place.
      """
      shortestNap       = 5       #  5 10 20 40 80 160 320
      longestNap        = 30
      pollsPerStep      = 5
      quietReportPeriod = 30
      unknownSites      = ("", '?')

      elapsed      = 0
      nap          = shortestNap
      pollCount    = 0
      lastReportAt = elapsed

      # instance -> (status, stage, site) as seen on the previous poll
      lastSeen = {}

      def statusLine(jobId,stage,statusText,site):
         # Build the stdout line; the site is omitted when it is unknown.
         if site in unknownSites:
            return("(%s) %s %s %s\n" % (jobId,stage,statusText,time.ctime()))
         return("(%s) %s %s at %s %s\n" % (jobId,stage,statusText,site,time.ctime()))

      # Initial pass over the killed jobs, before any sleeping.
      nPending = 0
      for instance in waitForJobsInfo:
         jobInfo = waitForJobsInfo[instance]
         if jobInfo['recentJobStatus'] != 'K':
            continue

         if jobInfo['isBatchJob']:
            siteMonitorDesignator = jobInfo['siteMonitorDesignator']
            remoteJobId           = jobInfo['remoteJobId']
            knownSite             = jobInfo['knownSite']
            jobStatus,jobStage,jobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)

            # Fall back to the configured site when the query reports none.
            if jobSite in unknownSites and knownSite != "":
               jobSite = knownSite
            if jobSite not in unknownSites:
               jobInfo['recentJobSite'] = jobSite

            statusText = self.__getJobStatusMessage(jobStatus)
            if statusText == 'Unknown Status':
               log("waitForKilledBatchJobs: siteMonitor= %s,remoteJobId= %s,jobStatus= %s,jobStage= %s,jobSite= %s" % \
                                     (siteMonitorDesignator,remoteJobId,jobStatus,jobStage,jobSite))
            if jobSite in unknownSites:
               log("status:%s %s" % (jobStage,jobStatus))
            else:
               log("status:%s %s %s" % (jobStage,jobStatus,jobSite))
            self.__writeToStdout(statusLine(remoteJobId,jobStage,statusText,jobSite))
         else:
            # Non-batch jobs are never queried; they count as done.
            jobStatus,jobStage,jobSite = 'D','Job',''

         if jobStatus == 'D':
            jobInfo['recentJobStatus'] = 'KD'
         else:
            nPending += 1

         lastSeen[instance] = (jobStatus,jobStage,jobSite)

      # Keep polling until no killed job is left outstanding.
      while nPending > 0:
         pollCount += 1
         time.sleep(nap)
         elapsed += nap
         if pollCount == pollsPerStep:
            pollCount = 0
            nap = min(nap*2,longestNap)

         for instance in waitForJobsInfo:
            jobInfo = waitForJobsInfo[instance]
            if jobInfo['recentJobStatus'] != 'K' or not jobInfo['isBatchJob']:
               continue

            siteMonitorDesignator = jobInfo['siteMonitorDesignator']
            remoteJobId           = jobInfo['remoteJobId']
            knownSite             = jobInfo['knownSite']
            earlierSnapshot       = lastSeen[instance]
            jobStatus,jobStage,jobSite = self.queryRemoteJobStatus(siteMonitorDesignator,remoteJobId)

            if jobSite in unknownSites and knownSite != "":
               jobSite = knownSite
            if jobSite not in unknownSites:
               jobInfo['recentJobSite'] = jobSite

            if (jobStatus,jobStage,jobSite) != earlierSnapshot:
               # Something changed: report it and restart the back-off.
               statusText = self.__getJobStatusMessage(jobStatus)
               if statusText == 'Unknown Status':
                  log("waitForKilledBatchJobs: siteMonitor= %s,remoteJobId= %s,jobStatus= %s,jobStage= %s,jobSite= %s" % \
                                        (siteMonitorDesignator,remoteJobId,jobStatus,jobStage,jobSite))
               if jobSite in unknownSites:
                  log("status:%s %s" % (jobStage,jobStatus))
               else:
                  log("status:%s %s %s" % (jobStage,jobStatus,jobSite))
               self.__writeToStdout(statusLine(remoteJobId,jobStage,statusText,jobSite))

               if jobStatus == 'D':
                  jobInfo['recentJobStatus'] = 'KD'
                  nPending -= 1
               lastSeen[instance] = (jobStatus,jobStage,jobSite)
               lastReportAt = elapsed
               nap          = shortestNap
               pollCount    = 0
            elif elapsed >= (lastReportAt + quietReportPeriod):
               # Unchanged for a while: repeat the current status on stdout.
               statusText = self.__getJobStatusMessage(jobStatus)
               self.__writeToStdout(statusLine(remoteJobId,jobStage,statusText,jobSite))
               lastReportAt = elapsed


