#!/usr/bin/env python
#
# @package      hubzero-submit-monitors
# @file         BatchMonitors/monitorLL.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2004-2014 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2004-2014 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# ----------------------------------------------------------------------
#  monitorLL.py
#
#  script which monitors the LL queue and reports changes in job status
#
import sys
import os
import select
import subprocess
import re
import signal

from LogMessage import openLog, log

# Name this monitor uses for its site in history records and status messages.
SITEDESIGNATOR = "llHost"

# Queue listing command; QueueMonitor runs it through the shell once per cycle.
QSTATCOMMAND = "llq -W -u llUser -f %jn %st"

# Directory holding the job history file that is reloaded at startup.
MONITORROOT     = os.path.join(os.sep, 'home', 'llUser', 'Submit', 'llHost')
HISTORYFILENAME = "monitorLL.history"
HISTORYFILEPATH = os.path.join(MONITORROOT, HISTORYFILENAME)

# Destination of the monitor's own log.
MONITORLOGLOCATION = os.path.join(os.sep, 'var', 'log', 'submit', 'monitors')
MONITORLOGFILENAME = "monitorLL.log"
LOGPATH            = os.path.join(MONITORLOGLOCATION, MONITORLOGFILENAME)

# Timing (seconds, as these values end up as select() timeouts):
#   SLEEPTIME       - bound of the per-cycle delay loop and read timeout for
#                     the queue command
#   PAUSETIME       - how long each wait for a new job id on stdin may block
#   MAXIMUMIDLETIME - divided by SLEEPTIME to get the number of consecutive
#                     empty queue listings after which the monitor exits
SLEEPTIME       = 10
PAUSETIME       = 5.0
MAXIMUMIDLETIME = 30 * 60


class QueueMonitor:
   def __init__(self,
                siteDesignator,
                qstatCommand,
                historyFilePath,
                sleepTime,
                pauseTime,
                maximumIdleTime):
      """Store the monitor configuration and install the shutdown signal handlers.

      siteDesignator  -- site name used in history records and status messages
      qstatCommand    -- shell command that lists the queue
      historyFilePath -- path of the job history file
      sleepTime       -- bound of the per-cycle delay loop and read timeout
                         for the queue command (a select() timeout, seconds)
      pauseTime       -- timeout of each wait for input on stdin (seconds)
      maximumIdleTime -- converted into the number of consecutive empty queue
                         listings after which the monitor stops

      Raises ZeroDivisionError when sleepTime is zero.
      """
      self.siteDesignator  = siteDesignator
      self.qstatCommand    = qstatCommand
      self.historyFilePath = historyFilePath
      self.sleepTime       = sleepTime
      self.pauseTime       = pauseTime
      # Floor division keeps the threshold a whole number of polls.  A plain
      # "/" yields a fraction under true division whenever the idle time is
      # not a multiple of the sleep time, and an integer poll counter could
      # then never be equal to it.
      self.maximumConsecutiveEmptyQueues = int(maximumIdleTime // sleepTime)

      self.historyFile = None   # open history file object, or None
      self.updates     = []     # status changes collected during one cycle
      self.activeJobs  = {}     # jobId -> (status,stage)
      self.bufferSize  = 4096   # chunk size when reading command output

      # Every one of these signals ends in sigGEN_handler, which closes the
      # history file and exits.
      signal.signal(signal.SIGINT,self.sigINT_handler)
      signal.signal(signal.SIGHUP,self.sigHUP_handler)
      signal.signal(signal.SIGQUIT,self.sigQUIT_handler)
      signal.signal(signal.SIGABRT,self.sigABRT_handler)
      signal.signal(signal.SIGTERM,self.sigTERM_handler)


   def cleanup(self):
      """Close the history file if one is open and forget the handle."""
      if not self.historyFile:
         return
      self.historyFile.close()
      self.historyFile = None


   def sigGEN_handler(self, signalNumber, frame):
      """Common tail of all signal handlers: release the history file, log
      that the monitor stopped and terminate the process with exit status 1.

      signalNumber and frame are the standard signal handler arguments; they
      are not used here.
      """
      self.cleanup()
      log("%s monitor stopped" % self.siteDesignator)
      sys.exit(1)


   def sigINT_handler(self, signalNumber, frame):
      """SIGINT handler: log the signal, then run the common shutdown path."""
      log("Received SIGINT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigHUP_handler(self, signalNumber, frame):
      """SIGHUP handler: log the signal, then run the common shutdown path."""
      log("Received SIGHUP!")
      self.sigGEN_handler(signalNumber, frame)


   def sigQUIT_handler(self, signalNumber, frame):
      """SIGQUIT handler: log the signal, then run the common shutdown path."""
      log("Received SIGQUIT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigABRT_handler(self, signalNumber, frame):
      """SIGABRT handler: log the signal, then run the common shutdown path."""
      log("Received SIGABRT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigTERM_handler(self, signalNumber, frame):
      """SIGTERM handler: log the signal, then run the common shutdown path."""
      log("Received SIGTERM!")
      self.sigGEN_handler(signalNumber, frame)


   def openHistory(self, accessMode):
      """Open the history file in accessMode and keep it in self.historyFile.

      A read request ('r') for a path that is not an existing regular file
      leaves self.historyFile set to None instead of raising.  Every other
      mode is handed straight to open().
      """
      nothingToRead = accessMode == 'r' and not os.path.isfile(self.historyFilePath)
      if nothingToRead:
         self.historyFile = None
         return
      self.historyFile = open(self.historyFilePath, accessMode)


   def recordHistory(self, jobId):
      """Persist the current state of jobId and queue it for reporting.

      Appends one "<site>:<jobId> <status> <stage>" line to the open history
      file (flushed immediately) and adds "<jobId> <status> <stage>" to
      self.updates.  The history file must already be open for writing.
      """
      jobState = self.activeJobs[jobId]
      status   = jobState[0]
      stage    = jobState[1]

      historyRecord = "%s:%s %s %s\n" % (self.siteDesignator, str(jobId), status, stage)
      self.historyFile.write(historyRecord)
      self.historyFile.flush()

      self.updates.append(' '.join((str(jobId), status, stage)))


   def loadHistory(self):
      """Rebuild self.activeJobs from the history file, if there is one.

      Records have the form "<site>:<jobId> <status> <stage>" and are replayed
      in file order: status 'D' removes the job, any other status stores the
      job.  The stage written in the file is not read back; every restored
      job gets stage 'Simulation'.

      Lines without a colon past the first column, and records with fewer
      than two fields after the colon (for example a line cut short by an
      interrupted write), are skipped so that one damaged line cannot keep
      the monitor from starting.
      """
      self.openHistory('r')
      if not self.historyFile:
         return

      try:
         for record in self.historyFile:
            colon = record.find(':')
            if colon <= 0:
               continue
            jobState = record[colon+1:].split()
            if len(jobState) < 2:
               continue
            jobId  = jobState[0]
            status = jobState[1]
            if status == 'D':
               self.activeJobs.pop(jobId,None)
            else:
               self.activeJobs[jobId] = (status,'Simulation')
      finally:
         # cleanup() tolerates the handle having been released already, e.g.
         # by a signal handler that ran while the file was being read.
         self.cleanup()


   def saveHistory(self):
      """Rewrite the history file so it holds exactly the jobs in self.activeJobs.

      The file is opened with mode 'w', one "<site>:<jobId> <status> <stage>"
      line is written per job, and the file is closed again; self.historyFile
      is None on return.
      """
      self.openHistory('w')
      if not self.historyFile:
         return

      for jobId in self.activeJobs:
         jobState = self.activeJobs[jobId]
         self.historyFile.write("%s:%s %s %s\n" % (self.siteDesignator, str(jobId),
                                                   jobState[0], jobState[1]))

      self.historyFile.close()
      self.historyFile = None


   def executeQstatCommand(self,
                           command):
      """Run command through the shell and collect everything it prints.

      stdout and stderr are drained together with select() so that neither
      pipe can fill up and stall the child.  When no data arrives on either
      pipe within self.sleepTime seconds the child is sent SIGTERM; reading
      then carries on until both pipes have reached end of file.

      Returns (status, stdoutText, stderrText).  status is 0 on success, the
      exit code when the command exited with a non-zero code, and the raw
      wait status when it was ended by a signal.  Failures are logged
      together with the captured stderr text.
      """
      child = subprocess.Popen(command,shell=True,bufsize=self.bufferSize,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
      childPid   = child.pid
      childout   = child.stdout
      childoutFd = childout.fileno()
      childerr   = child.stderr
      childerrFd = childerr.fileno()

      outEOF = False
      errEOF = False

      outData = []
      errData = []

      try:
         while not (outEOF and errEOF):
            toCheck = []
            if not outEOF:
               toCheck.append(childoutFd)
            if not errEOF:
               toCheck.append(childerrFd)
            ready = select.select(toCheck,[],[],self.sleepTime) # wait for input

            if childoutFd in ready[0]:
               outChunk = os.read(childoutFd,self.bufferSize)
               # An empty read means end of file.  Testing truthiness works
               # whether os.read() returns str or bytes; comparing against ''
               # never matches bytes and would loop forever.
               if outChunk:
                  outData.append(outChunk)
               else:
                  outEOF = True

            if childerrFd in ready[0]:
               errChunk = os.read(childerrFd,self.bufferSize)
               if errChunk:
                  errData.append(errChunk)
               else:
                  errEOF = True

            if len(ready[0]) == 0:
               # timed out waiting for output
               os.kill(childPid,signal.SIGTERM)
      finally:
         # Release the pipe descriptors now rather than whenever the Popen
         # object happens to be collected.
         childout.close()
         childerr.close()

      outText = b"".join(outData)
      errText = b"".join(errData)
      if not isinstance(outText,str):
         # os.read() handed back bytes: turn them into text for the callers
         outText = outText.decode('utf-8','replace')
         errText = errText.decode('utf-8','replace')

      pid,err = os.waitpid(childPid,0)
      if err != 0:
         if os.WIFSIGNALED(err):
            log("%s failed w/ exit code %d signal %d" % (command,os.WEXITSTATUS(err),os.WTERMSIG(err)))
         else:
            if os.WIFEXITED(err):
               err = os.WEXITSTATUS(err)
            log("%s failed w/ exit code %d" % (command,err))
         log("%s" % (errText))

      return(err,outText,errText)


   def monitorQ(self):
      """Poll the queue until it has been idle long enough, reporting changes.

      Each cycle
        1. waits on stdin for newly announced job ids; an id not yet monitored
           is recorded with status 'N' and then held as 'n' for the rest of
           the cycle so that it is not declared done before the queue listing
           had a chance to show it,
        2. runs the queue listing command,
        3. marks monitored jobs that are missing from the listing as done
           ('D') and records every new or changed job,
        4. writes one "<count> <site>:<update>:<update>..." line to stdout
           when anything changed,
        5. rewrites the history file with the jobs still being monitored.

      The process exits with status 0 once the listing has been empty for
      self.maximumConsecutiveEmptyQueues successive successful polls (the
      counter restarts whenever a job is announced on stdin).  It sends
      itself SIGTERM when its parent process id is 1.
      """
      self.openHistory('a')
      consecutiveEmptyQueues = 0
      lastReportedActiveJobCount = 0

      stdinFd = sys.stdin.fileno()
      toCheck = [stdinFd]
      while True:
         activeJobCount = len(self.activeJobs)
         if activeJobCount != lastReportedActiveJobCount:
            log("%d monitored jobs" % (activeJobCount))
         lastReportedActiveJobCount = activeJobCount

         self.updates  = []
         completedJobs = []

         delayTime = 0
         while delayTime <= self.sleepTime:
            if os.getppid() == 1:
               os.kill(os.getpid(),signal.SIGTERM)

            ready = select.select(toCheck,[],[],self.pauseTime) # wait for input
            if stdinFd in ready[0]:
               newJobLine = sys.stdin.readline()
               if newJobLine == "":
                  # End of file: the descriptor would stay readable forever,
                  # turning the pauses into no-ops and the queue command into
                  # a tight loop.  Stop watching it; select() then only
                  # sleeps for pauseTime.
                  log("stdin closed, no further job announcements will be read")
                  toCheck = []
               else:
                  newJob = newJobLine.strip()
                  if newJob != "":
                     if not newJob in self.activeJobs:
                        self.activeJobs[newJob] = ('N','Job')
                        self.recordHistory(newJob)
                        self.activeJobs[newJob] = ('n','Job')
                     consecutiveEmptyQueues = 0
            delayTime += self.pauseTime

         errStatus,qstatOutput,qstatError = self.executeQstatCommand(self.qstatCommand)
         if errStatus == 0:
            currentJobs = self._parseQstatOutput(qstatOutput)

            if len(currentJobs) == 0:
               consecutiveEmptyQueues += 1
            else:
               consecutiveEmptyQueues = 0

            # Only existing keys are reassigned here, so the dictionary does
            # not change size while it is being iterated.
            for activeJob in self.activeJobs:
               if self.activeJobs[activeJob][0] == 'n':
                  # announced during this cycle: give it one more poll
                  self.activeJobs[activeJob] = ('N','Job')
               else:
                  if not activeJob in currentJobs:
                     self.activeJobs[activeJob] = ('D',self.activeJobs[activeJob][1])
                     self.recordHistory(activeJob)
                     completedJobs.append(activeJob)

            for currentJob in currentJobs:
               if   not currentJob in self.activeJobs:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               elif currentJobs[currentJob] != self.activeJobs[currentJob]:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               if self.activeJobs[currentJob][0] == 'D':
                  completedJobs.append(currentJob)

            for completedJob in completedJobs:
               del self.activeJobs[completedJob]

            if len(self.updates) > 0:
               updateMessage = str(len(self.updates)) + ' ' + self.siteDesignator + ':' + ':'.join(self.updates)
               sys.stdout.write("%s\n" % (updateMessage))
               sys.stdout.flush()

            # reset rather than delete so the attribute always exists
            self.updates = []

            if self.historyFile:
               # compact the history: drop the appended change log and keep
               # one line per job that is still monitored
               self.historyFile.close()
               self.historyFile = None
               self.saveHistory()
               self.openHistory('a')

            # ">=" so the monitor still stops if the counter ever passes the
            # threshold or the threshold is not a whole number
            if consecutiveEmptyQueues >= self.maximumConsecutiveEmptyQueues:
               self.cleanup()
               log("%s monitor stopped" % (self.siteDesignator))
               sys.exit(0)
         else:
            log("Error %d in %s command:\n%s" % (errStatus,self.qstatCommand,qstatError))


   def _parseQstatOutput(self,
                         qstatOutput):
      """Turn the text printed by the queue command into {jobId: (status,stage)}.

      A line is taken as a job line when it contains a '.' followed by a
      digit and splits into exactly two whitespace separated fields, job
      identifier and state.  Anything else (headers, the summary line, blank
      lines) is ignored.  The stage is always 'Simulation'.
      """
#Id                       Owner      Submitted   ST PRI Class        Running On
#------------------------ ---------- ----------- -- --- ------------ -----------
#fengpfs.79854.0          hpc2        2/2  11:24 I  50  long

#1 job step(s) in query, 1 waiting, 0 pending, 0 running, 0 held, 0 preempted

#http://publib.boulder.ibm.com/infocenter/clresctr/vxrx/index.jsp?topic=/com.ibm.cluster.loadl.doc/loadl34/am2ug30506.html
#Canceled            CA  The job was canceled either by a user or by an administrator.
#Checkpointing       CK  Indicates that a checkpoint has been initiated.
#Completed            C  The job has completed.
#Complete Pending    CP  The job is in the process of being completed.
#Deferred             D  The job will not be assigned to a machine until a specified date.
#                        This date may have been specified by the user in the job command file,
#                        or may have been generated by the negotiator because a parallel job
#                        did not accumulate enough machines to run the job. Only the central
#                        manager places a job in the Deferred state.
#Idle                 I  The job is being considered to run on a machine,
#                        though no machine has been selected.
#Not Queued          NQ  The job is not being considered to run on a machine. A job can enter
#                        this state because the associated Schedd is down, the user or group
#                        associated with the job is at its maximum maxqueued or maxidle value,
#                        or because the job has a dependency which cannot be determined. For
#                        more information on these keywords, see Controlling the mix of idle
#                        and running jobs. (Only the central manager places a job in the NotQueued state.)
#Not Run             NR  The job will never be run because a dependency associated with the job was found to be false.
#Pending              P  The job is in the process of starting on one or more machines.
#                        (The negotiator indicates this state until the Schedd acknowledges
#                        that it has received the request to start the job. Then the negotiator
#                        changes the state of the job to Starting. The Schedd indicates the
#                        Pending state until all startd machines have acknowledged receipt of
#                        the start request. The Schedd then changes the state of the job to Starting.)
#Preempted            E  The job is preempted. This state applies only when LoadLeveler uses the
#                        suspend method to preempt the job.
#Preempt Pending     EP  The job is in the process of being preempted. This state applies only when
#                        LoadLeveler uses the suspend method to preempt the job.
#Rejected             X  The job is rejected.
#Reject Pending      XP  The job did not start. Possible reasons why a job is rejected are: job
#                        requirements were not met on the target machine, or the user ID of the person
#                        running the job is not valid on the target machine. After a job leaves the
#                        Reject Pending state, it is moved into one of the following states:
#                        Idle, User Hold, or Removed.
#Removed             RM  The job was stopped by LoadLeveler.
#Remove Pending      RP  The job is in the process of being removed, but not all associated machines
#                        have acknowledged the removal of the job.
#Resume Pending      MP  The job is in the process of being resumed.
#Running              R  The job is running: the job was dispatched and has started on the designated machine.
#Starting            ST  The job is starting: the job was dispatched, was received by the target machine,
#                        and LoadLeveler is setting up the environment in which to run the job. For a
#                        parallel job, LoadLeveler sets up the environment on all required nodes. See
#                        the description of the Pending state for more information on when the negotiator
#                        or the Schedd daemon moves a job into the Starting state.
#System Hold          S  The job has been put in system hold.
#Terminated          TX  If the negotiator and Schedd daemons experience communication problems,
#                        they may be temporarily unable to exchange information concerning the status
#                        of jobs in the system. During this period of time, some of the jobs may
#                        actually complete and therefore be removed from the Schedd's list of active jobs.
#                        When communication resumes between the two daemons, the negotiator will move
#                        such jobs to the Terminated state, where they will remain for a set period of
#                        time (specified by the NEGOTIATOR_REMOVE_COMPLETED keyword in the configuration file).
#                        When this time has passed, the negotiator will remove the jobs from its active list.
#User & System Hold  HS  The job has been put in both system hold and user hold.
#User Hold            H  The job has been put in user hold.
#Vacated              V  The job started but did not complete. The negotiator will reschedule the job
#                        (provided the job is allowed to be rescheduled). Possible reasons why a job moves
#                        to the Vacated state are: the machine where the job was running was flushed,
#                        the VACATE expression in the configuration file evaluated to True, or LoadLeveler
#                        detected a condition indicating the job needed to be vacated. For more information
#                        on the VACATE expression, see Managing job status through control expressions.
#Vacate Pending      VP  The job is in the process of being vacated.

      # One-letter LoadLeveler codes that are reported under a two-letter
      # name.  'D' has to be renamed because this monitor uses 'D' for a
      # finished job; the reason for renaming the other three is not visible
      # in this file (presumably the same kind of clash downstream).
      statusAliases = {'D':'DF',
                       'E':'PT',
                       'X':'RJ',
                       'S':'SH'}

      currentJobs = {}
      for job in qstatOutput.splitlines():
         if not re.match(r".*\.[0-9]+",job):
            continue
         fields = job.split()
         if len(fields) != 2:
            # not "<job> <state>": skip it rather than abort the monitor
            continue
         jobId,status = fields
         currentJobs[jobId] = (statusAliases.get(status,status),'Simulation')

      return(currentJobs)


if __name__ == '__main__':
   # The log has to be open before the first message is written.
   openLog(LOGPATH)
   log("%s monitor started" % SITEDESIGNATOR)

   queueMonitor = QueueMonitor(siteDesignator=SITEDESIGNATOR,
                               qstatCommand=QSTATCOMMAND,
                               historyFilePath=HISTORYFILEPATH,
                               sleepTime=SLEEPTIME,
                               pauseTime=PAUSETIME,
                               maximumIdleTime=MAXIMUMIDLETIME)

   # Replay the previous run's history, write it back reduced to the jobs
   # still being monitored, then poll until the monitor decides to exit.
   queueMonitor.loadHistory()
   queueMonitor.saveHistory()
   queueMonitor.monitorQ()


