#!/usr/bin/env python
#
# @package      hubzero-submit-distributor
# @file         monitorSLURM.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2004-2012 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2004-2012 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# ----------------------------------------------------------------------
#  monitorSLURM.py
#
#  script which monitors the SLURM queue and reports changes in job status
#
import sys
import os
import select
import subprocess
import signal

from LogMessage import openLog, log

# --- site configuration -------------------------------------------------
# Name placed in front of every history record and update message.
SITEDESIGNATOR = "slurmHost"
# Directory that holds the job history file.
MONITORROOT = os.path.join(os.sep,
                           'home',
                           'slurmUser',
                           'Submit',
                           'slurmHost')
# Queue listing command: one "<job id> <state code>" pair per line.
QSTATCOMMAND = "squeue --noheader -o '%.i %t'"

# --- log and history files ----------------------------------------------
MONITORLOGLOCATION = os.path.join(os.sep,
                                  'var',
                                  'log',
                                  'submit',
                                  'monitors')
MONITORLOGFILENAME = "monitorSLURM.log"
LOGPATH = os.path.join(MONITORLOGLOCATION,
                       MONITORLOGFILENAME)
HISTORYFILENAME = "monitorSLURM.history"
HISTORYFILEPATH = os.path.join(MONITORROOT,
                               HISTORYFILENAME)

# --- timing (seconds) ---------------------------------------------------
# SLEEPTIME bounds the delay between queue polls, PAUSETIME is the timeout
# of each wait for a new job id on stdin, and MAXIMUMIDLETIME/SLEEPTIME is
# the number of consecutive empty polls after which the monitor exits.
SLEEPTIME = 10
PAUSETIME = 5.0
MAXIMUMIDLETIME = 30 * 60


class QueueMonitor:
   """Poll the SLURM queue and report every change in job status.

   Job ids to watch arrive one per line on stdin.  Each state change is
   appended to the history file and written to stdout as one line of the
   form

      <count> <site>:<jobId> <status> <stage>[:<jobId> <status> <stage>...]

   Besides the state codes printed by the queue listing command, three
   internal status codes are used:

      N  job id was received on stdin and has been reported
      n  same job, not yet checked against a queue listing; it is turned
         back into N at the next successful poll, so a freshly submitted
         job gets one poll of grace before its absence from the queue is
         treated as completion
      D  job is no longer listed in the queue (done)
   """

   def __init__(self,
                siteDesignator,
                qstatCommand,
                historyFilePath,
                sleepTime,
                pauseTime,
                maximumIdleTime):
      """Store the configuration and install the termination handlers.

      siteDesignator  -- name prefixed to history records and updates
      qstatCommand    -- shell command listing "<jobId> <status>" lines
      historyFilePath -- file in which job states are persisted
      sleepTime       -- seconds; bound of the delay between polls and
                         timeout while waiting for qstatCommand output
      pauseTime       -- seconds; timeout of each wait for stdin input
      maximumIdleTime -- seconds; converted to a number of consecutive
                         empty polls after which the monitor exits
      """
      self.siteDesignator  = siteDesignator
      self.qstatCommand    = qstatCommand
      self.historyFilePath = historyFilePath
      self.sleepTime       = sleepTime
      self.pauseTime       = pauseTime
      # Floor division keeps the limit a whole number of polls on both
      # Python 2 and Python 3; true division could yield a fraction that an
      # integer poll counter never equals.
      self.maximumConsecutiveEmptyQueues = maximumIdleTime // sleepTime

      self.historyFile = None
      self.updates     = []
      self.activeJobs  = {}
      self.bufferSize  = 4096

      signal.signal(signal.SIGINT,self.sigINT_handler)
      signal.signal(signal.SIGHUP,self.sigHUP_handler)
      signal.signal(signal.SIGQUIT,self.sigQUIT_handler)
      signal.signal(signal.SIGABRT,self.sigABRT_handler)
      signal.signal(signal.SIGTERM,self.sigTERM_handler)


   def cleanup(self):
      """Close the history file if it is open."""
      if self.historyFile:
         self.historyFile.close()
         self.historyFile = None


   def sigGEN_handler(self,
                      signalNumber,
                      frame):
      """Common signal action: close the history, log the stop, exit(1)."""
      self.cleanup()
      log("%s monitor stopped" % (self.siteDesignator))
      sys.exit(1)


   def sigINT_handler(self,
                      signalNumber,
                      frame):
      """Log receipt of SIGINT, then shut down."""
      log("Received SIGINT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigHUP_handler(self,
                      signalNumber,
                      frame):
      """Log receipt of SIGHUP, then shut down."""
      log("Received SIGHUP!")
      self.sigGEN_handler(signalNumber,frame)


   def sigQUIT_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGQUIT, then shut down."""
      log("Received SIGQUIT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigABRT_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGABRT, then shut down."""
      log("Received SIGABRT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigTERM_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGTERM, then shut down."""
      log("Received SIGTERM!")
      self.sigGEN_handler(signalNumber,frame)


   def openHistory(self,
                   accessMode):
      """Open the history file in accessMode and keep it in self.historyFile.

      In read mode a missing file is not an error: self.historyFile is
      simply left as None.
      """
      if accessMode == 'r' and not os.path.isfile(self.historyFilePath):
         self.historyFile = None
      else:
         self.historyFile = open(self.historyFilePath,accessMode)


   def recordHistory(self,
                     jobId):
      """Append the current state of jobId to the history file and to the
      list of pending updates.  The history file must already be open."""
      status = self.activeJobs[jobId][0]
      stage  = self.activeJobs[jobId][1]
      self.historyFile.write("%s:%s %s %s\n" % (self.siteDesignator,str(jobId),status,stage))
      # flush so the record survives an abrupt termination of the monitor
      self.historyFile.flush()
      self.updates.append(str(jobId) + ' ' + status + ' ' + stage)


   def loadHistory(self):
      """Rebuild self.activeJobs by replaying the history file.

      Records look like "<site>:<jobId> <status> <stage>".  A record with
      status D removes the job, any other status (re)registers it.  Records
      without a colon or with fewer than two fields are skipped.
      """
      self.openHistory('r')
      if self.historyFile:
         try:
            for record in self.historyFile:
               colon = record.find(':')
               if colon > 0:
                  jobState = record[colon+1:].split()
                  if len(jobState) < 2:
                     # truncated record, e.g. from an interrupted write
                     continue
                  jobId  = jobState[0]
                  status = jobState[1]
                  # NOTE(review): the stage stored in the record is ignored
                  # and every reloaded job gets stage 'Simulation' -- confirm
                  # that this is intended for jobs saved with stage 'Job'.
                  stage  = 'Simulation'
                  if status == 'D':
                     self.activeJobs.pop(jobId,None)
                  else:
                     self.activeJobs[jobId] = (status,stage)
         finally:
            # cleanup() tolerates a file already closed by a signal handler
            self.cleanup()


   def saveHistory(self):
      """Overwrite the history file with one record per active job."""
      self.openHistory('w')
      if self.historyFile:
         try:
            for activeJob in self.activeJobs:
               status = self.activeJobs[activeJob][0]
               stage  = self.activeJobs[activeJob][1]
               self.historyFile.write("%s:%s %s %s\n" % (self.siteDesignator,str(activeJob),status,stage))
         finally:
            self.cleanup()


   def _toText(self,
               chunks):
      """Join the chunks returned by os.read() into one native string."""
      data = b"".join(chunks)
      if isinstance(data,str):
         # Python 2: os.read() already returns str
         return(data)
      return(data.decode('utf-8','replace'))


   def executeQstatCommand(self,
                           command):
      """Run command through the shell and collect its output.

      Whenever neither pipe delivers data within self.sleepTime seconds the
      child is sent SIGTERM.  A failure is logged together with the
      collected stderr text.

      Returns (status, stdout text, stderr text).  status is 0 on success,
      the exit code after a normal exit, or the raw wait status when the
      child was terminated by a signal.
      """
      child = subprocess.Popen(command,shell=True,bufsize=self.bufferSize,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
      childPid   = child.pid
      childout   = child.stdout
      childoutFd = childout.fileno()
      childerr   = child.stderr
      childerrFd = childerr.fileno()

      outEOF = False
      errEOF = False

      outData = []
      errData = []

      while not (outEOF and errEOF):
         toCheck = []
         if not outEOF:
            toCheck.append(childoutFd)
         if not errEOF:
            toCheck.append(childerrFd)
         ready = select.select(toCheck,[],[],self.sleepTime) # wait for input
         if childoutFd in ready[0]:
            outChunk = os.read(childoutFd,self.bufferSize)
            # An empty read marks EOF.  Test truthiness rather than
            # comparing with '' because os.read() returns bytes on Python 3.
            if outChunk:
               outData.append(outChunk)
            else:
               outEOF = True

         if childerrFd in ready[0]:
            errChunk = os.read(childerrFd,self.bufferSize)
            if errChunk:
               errData.append(errChunk)
            else:
               errEOF = True

         if len(ready[0]) == 0:
            # no output within sleepTime seconds: give up on the command
            os.kill(childPid,signal.SIGTERM)

      pid,err = os.waitpid(childPid,0)
      childout.close()
      childerr.close()

      outText = self._toText(outData)
      errText = self._toText(errData)
      if err != 0:
         if os.WIFSIGNALED(err):
            log("%s failed w/ exit code %d signal %d" % (command,os.WEXITSTATUS(err),os.WTERMSIG(err)))
         else:
            if os.WIFEXITED(err):
               err = os.WEXITSTATUS(err)
            log("%s failed w/ exit code %d" % (command,err))
         log("%s" % (errText))

      return(err,outText,errText)


   def _parseQueueListing(self,
                          qstatOutput):
      """Turn the queue listing into a {jobId: (status,stage)} dictionary.

      Lines that do not consist of exactly two fields are logged and
      skipped; blank lines are skipped silently.
      """
      currentJobs = {}
      for job in qstatOutput.splitlines():
         fields = job.split()
         if len(fields) == 2:
            jobId,status = fields
            currentJobs[jobId] = (status,'Simulation')
         elif fields:
            log("Ignoring unexpected queue record: %s" % (job))

      return(currentJobs)


   def _reconcileJobs(self,
                      currentJobs):
      """Bring self.activeJobs in line with currentJobs, recording every
      change, and drop the jobs that have completed."""
      completedJobs = []

      for activeJob in self.activeJobs:
         if self.activeJobs[activeJob][0] == 'n':
            # freshly submitted: grant one poll before expecting the job
            # to show up in the queue listing
            self.activeJobs[activeJob] = ('N','Job')
         elif not activeJob in currentJobs:
            self.activeJobs[activeJob] = ('D',self.activeJobs[activeJob][1])
            self.recordHistory(activeJob)
            completedJobs.append(activeJob)

      for currentJob in currentJobs:
         # covers both jobs seen for the first time and changed states
         if self.activeJobs.get(currentJob) != currentJobs[currentJob]:
            self.activeJobs[currentJob] = currentJobs[currentJob]
            self.recordHistory(currentJob)
         if self.activeJobs[currentJob][0] == 'D':
            completedJobs.append(currentJob)

      for completedJob in completedJobs:
         del self.activeJobs[completedJob]


   def _reportUpdates(self):
      """Write the pending updates to stdout as one line and clear them."""
      if len(self.updates) > 0:
         updateMessage = str(len(self.updates)) + ' ' + self.siteDesignator + ':' + ':'.join(self.updates)
         sys.stdout.write("%s\n" % (updateMessage))
         sys.stdout.flush()

      self.updates = []


   def monitorQ(self):
      """Main loop: accept job ids from stdin, poll the queue, report changes.

      Does not return.  The process exits with status 0 once the queue
      listing has been empty for maximumConsecutiveEmptyQueues consecutive
      successful polls, and is sent SIGTERM (handled by exiting with
      status 1) when its parent process id becomes 1.
      """
      self.openHistory('a')
      consecutiveEmptyQueues     = 0
      lastReportedActiveJobCount = 0

      stdinFd = sys.stdin.fileno()
      toCheck = [stdinFd]
      while True:
         activeJobCount = len(self.activeJobs)
         if activeJobCount != lastReportedActiveJobCount:
            log("%d monitored jobs" % (activeJobCount))
         lastReportedActiveJobCount = activeJobCount

         self.updates = []

         # NOTE(review): with '<=' this loop runs sleepTime/pauseTime + 1
         # times, i.e. up to sleepTime + pauseTime seconds -- confirm intent.
         delayTime = 0
         while delayTime <= self.sleepTime:
            if os.getppid() == 1:
               # parent process id 1: treat the monitor as orphaned, shut down
               os.kill(os.getpid(),signal.SIGTERM)

            ready = select.select(toCheck,[],[],self.pauseTime) # wait for input
            if stdinFd in ready[0]:
               # NOTE(review): readline() reads through sys.stdin's buffer, so
               # ids delivered together may sit there unnoticed by select()
               # until more input arrives -- confirm senders write one id at
               # a time.
               inputLine = sys.stdin.readline()
               if inputLine == "":
                  # End of file: select() reports stdin as readable at once,
                  # so wait out the pause here instead of spinning.
                  select.select([],[],[],self.pauseTime)
               else:
                  newJob = inputLine.strip()
                  if newJob != "":
                     if not newJob in self.activeJobs:
                        self.activeJobs[newJob] = ('N','Job')
                        self.recordHistory(newJob)
                        self.activeJobs[newJob] = ('n','Job')
                     consecutiveEmptyQueues = 0
            delayTime += self.pauseTime

         # SLURM job state codes reported by squeue:
         #   CA  CANCELLED   Job was explicitly cancelled by the user or system
         #                   administrator.  The job may or may not have been
         #                   initiated.
         #   CD  COMPLETED   Job has terminated all processes on all nodes.
         #   CG  COMPLETING  Job is in the process of completing. Some processes
         #                   on some nodes may still be active.
         #   F   FAILED      Job terminated with non-zero exit code or other
         #                   failure condition.
         #   NF  NODE_FAIL   Job terminated due to failure of one or more
         #                   allocated nodes.
         #   PD  PENDING     Job is awaiting resource allocation.
         #   R   RUNNING     Job currently has an allocation.
         #   S   SUSPENDED   Job has an allocation, but execution has been
         #                   suspended.
         #   TO  TIMEOUT     Job terminated upon reaching its time limit.

         errStatus,qstatOutput,qstatError = self.executeQstatCommand(self.qstatCommand)
         if errStatus == 0:
            currentJobs = self._parseQueueListing(qstatOutput)

            if len(currentJobs) == 0:
               consecutiveEmptyQueues += 1
            else:
               consecutiveEmptyQueues = 0

            self._reconcileJobs(currentJobs)
            self._reportUpdates()

            if self.historyFile:
               # compact the history: rewrite it with the active jobs only
               self.historyFile.close()
               self.historyFile = None
               self.saveHistory()
               self.openHistory('a')

            # '>=' rather than '==' so the limit cannot be stepped over
            if consecutiveEmptyQueues >= self.maximumConsecutiveEmptyQueues:
               self.cleanup()
               log("%s monitor stopped" % (self.siteDesignator))
               sys.exit(0)
         else:
            log("Error %d in %s command:\n%s" % (errStatus,self.qstatCommand,qstatError))
            # Jobs accepted from stdin during this cycle were already
            # recorded; report them now, otherwise the reset at the top of
            # the loop would discard them.
            self._reportUpdates()


if __name__ == '__main__':
   # Script entry point: open the log, then run the monitor until it exits.
   openLog(LOGPATH)
   log("%s monitor started" % (SITEDESIGNATOR))

   queueMonitor = QueueMonitor(SITEDESIGNATOR,
                               QSTATCOMMAND,
                               HISTORYFILEPATH,
                               SLEEPTIME,
                               PAUSETIME,
                               MAXIMUMIDLETIME)

   # Replay the previous history, rewrite it with only the jobs that are
   # still active, then enter the monitoring loop.
   queueMonitor.loadHistory()
   queueMonitor.saveHistory()
   queueMonitor.monitorQ()


