#!/usr/bin/env python
#
# @package      hubzero-submit-monitors
# @file         monitorJobSQL.py
# @copyright    Copyright (c) 2004-2020 The Regents of the University of California.
# @license      http://opensource.org/licenses/MIT MIT
#
# Copyright (c) 2004-2020 The Regents of the University of California.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# HUBzero is a registered trademark of The Regents of the University of California.
#
# ----------------------------------------------------------------------
#  monitorJobSQL.py
#
#  script which gathers monitoring statistics from remote sites and
#  reports job status to distributor on demand
#
import sys
import os
import subprocess
import select
import signal
import time
import logging
from errno import EINTR

from hubzero.submit.LogMessage    import getLogPIDMessage as getLogMessage
from hubzero.submit.DaemonsInfo   import DaemonsInfo
from hubzero.submit.JobMonitorSQL import JobMonitorSQL as JobMonitor

# Directory and file names of the configuration files handed to MonitorJob
# by the script entry point at the bottom of this file.
CONFIGURATIONDIRECTORY   = os.path.join(os.sep,'etc','submit')
MONITORCONFIGURATIONFILE = 'jobmonitor.conf'
DAEMONSCONFIGURATIONFILE = 'daemons.conf'
INFOSCONFIGURATIONFILE   = 'infos.conf'

# Full path of the per-site monitor script.  MonitorJob executes it with a
# site designator argument and also uses the path as the pgrep/pkill pattern.
MONITORROOT     = os.path.join(os.sep,'opt','submit')
MONITORSITEFILE = "monitorJobSiteSQL.py"
MONITORSITEPATH = os.path.join(MONITORROOT,MONITORSITEFILE)

# Log file location passed to openLogger(); APPLICATIONLOGGER is the root
# logger, so handlers attached to it are shared by every logger in the process.
MONITORLOGLOCATION = os.path.join(os.sep,'var','log','submit','monitors')
MONITORLOGFILENAME = "monitorJobSQL.log"
APPLICATIONLOGGER  = logging.getLogger('')

# Schema version passed to JobMonitor.updateSchema() during start up.
TARGETSCHEMA = 5

# Timeout in seconds (15 minutes) of the select() call in MonitorJob.monitor().
ACTIVITYUPDATEINTERVAL = 15.*60.

def openLogger(logDirectory,
               hubLogFile):
   """
   Attach a file handler to the application (root) logger.

   The logger level is set to DEBUG and a FileHandler writing to
   logDirectory/hubLogFile is added.  Records whose rendered message is
   the empty string are dropped by the handler.

   Returns the file descriptor number of the opened log file so the
   caller can redirect other streams onto it.
   """
   class NonEmptyMessageFilter(logging.Filter):
      """
      Filter which only lets through records carrying a non-empty message

      """

      def filter(self,record):
         return(record.getMessage() != "")

   APPLICATIONLOGGER.setLevel(logging.DEBUG)

   fileHandler       = logging.FileHandler(os.path.join(logDirectory,hubLogFile))
   logFileDescriptor = fileHandler.stream.fileno()

   fileHandler.addFilter(NonEmptyMessageFilter())
   fileHandler.setFormatter(logging.Formatter('%(asctime)s %(message)s',
                                              '%s [%a %b %d %H:%M:%S %Y]'))
   APPLICATIONLOGGER.addHandler(fileHandler)

   return(logFileDescriptor)


def daemonize(fdLogFile):
   """
   Detach the calling process from its parent and terminal.

   Unless fdLogFile already is the stdout descriptor, stdin is redirected
   from /dev/null and stdout/stderr are redirected onto fdLogFile.  The
   process then forks twice: the original process waits for the first
   child and exits, the first child starts a new session, changes
   directory to / and exits after forking again.  Only the second child
   returns from this function, after a two second pause.
   """
   redirectNeeded = fdLogFile != sys.stdout.fileno()
   if redirectNeeded:
      try:
         nullInput = open("/dev/null",'r')
         try:
            os.dup2(nullInput.fileno(),sys.stdin.fileno())
            for standardStream in (sys.stdout,sys.stderr):
               os.dup2(fdLogFile,standardStream.fileno())
         except OSError:
            APPLICATIONLOGGER.log(logging.ERROR,getLogMessage("file descriptor dup failed"))
      except (IOError,OSError):
         APPLICATIONLOGGER.log(logging.ERROR,getLogMessage("%s could not be opened" % ("/dev/null")))

   # First fork: the original process reaps the intermediate child and leaves.
   if os.fork() != 0:
      os.wait()
      sys.exit(0)

   # Intermediate child: new session, neutral working directory, fork again.
   os.setsid()
   os.chdir("/")
   if os.fork() != 0:
      sys.exit(0)

   # Only the second child gets here.
   time.sleep(2)


class MonitorJob:
   """
   Master job monitor daemon.

   Owns a JobMonitor (JobMonitorSQL) instance, launches the per-site
   monitor script for job sites that need one, and runs the select()
   based service loop.  Every accepted connection is handed to a
   double-forked child whose monitorType becomes 'RequestProcessor';
   the housekeeping sections of the loop only run while monitorType is
   'Master'.
   """

   def __init__(self,
                configurationDirectory,
                monitorConfigurationFile,
                daemonsConfigurationFile,
                infosConfigurationFile,
                monitorSitePath):
      """
      Configure the job monitor and start monitors for active job sites.

      configurationDirectory   -- directory holding the configuration files
      monitorConfigurationFile -- file name passed on to JobMonitor
      daemonsConfigurationFile -- file name read through DaemonsInfo to get
                                  the 'jobMonitor' tcp listen URI
      infosConfigurationFile   -- file name passed on to JobMonitor
      monitorSitePath          -- path of the per-site monitor script

      If JobMonitor.configure() or JobMonitor.updateSchema() fails the
      instance is flagged for termination; monitor() acts on that flag.
      Signal handlers for INT, HUP, QUIT, ABRT and TERM are installed last.
      """
      self.logger          = logging.getLogger(__name__)
      self.monitorSitePath = monitorSitePath

      self.monitorType = 'Master'
      self.bufferSize  = 4096

      configFilePath = os.path.join(configurationDirectory,daemonsConfigurationFile)
      daemonsInfo    = DaemonsInfo(configFilePath)
      self.listenURI = daemonsInfo.getDaemonListenURI('jobMonitor','tcp')

      # Get rid of site monitors left behind by a previous master.
      self.killOrphanJobSiteMonitors()

      self.jobMonitor = JobMonitor(configurationDirectory,
                                   monitorConfigurationFile,
                                   infosConfigurationFile,
                                   self.listenURI)

      self.jobMonitorSetInfoError = False
      self.resetInfo = False
      # Flagged for termination until configuration and schema update succeed.
      self.terminateJobMonitor = True
      if self.jobMonitor.configure():
         if self.jobMonitor.updateSchema(TARGETSCHEMA):
            nActiveJobCount = self.jobMonitor.getActiveJobCount()
            self.logger.log(logging.INFO,getLogMessage("%d jobs loaded from database" % (nActiveJobCount)))
            self.jobMonitor.killJobSiteMonitors()
            self.jobMonitorSetInfoError = self.jobMonitor.setInfo()
            if not self.jobMonitorSetInfoError:
               for siteDesignator in self.jobMonitor.getActiveSiteDesignators():
                  self._startJobSiteMonitorIfIdle(siteDesignator)
            self.terminateJobMonitor = False

      # Age limits and purge interval in seconds (3 hours, 6 hours, 1 hour).
      self.reportedJobDoneAgeLimit = 60*60*3
      self.jobDoneAgeLimit         = 60*60*6
      self.purgeJobInterval        = 60*60

      signal.signal(signal.SIGINT,self.sigINT_handler)
      signal.signal(signal.SIGHUP,self.sigHUP_handler)
      signal.signal(signal.SIGQUIT,self.sigQUIT_handler)
      signal.signal(signal.SIGABRT,self.sigABRT_handler)
      signal.signal(signal.SIGTERM,self.sigTERM_handler)


   def terminate(self):
      """Flag the monitor for termination; monitor() acts on the flag."""
      self.terminateJobMonitor = True


   def sigGEN_handler(self,
                      signalNumber,
                      frame):
      """Common signal action: request termination."""
      self.terminate()


   def sigINT_handler(self,
                      signalNumber,
                      frame):
      """SIGINT does not terminate; it asks monitor() to rerun JobMonitor.setInfo()."""
      self.logger.log(logging.INFO,getLogMessage("Received SIGINT!"))
      self.resetInfo = True


   def sigHUP_handler(self,
                      signalNumber,
                      frame):
      """Log SIGHUP and request termination."""
      self.logger.log(logging.INFO,getLogMessage("Received SIGHUP!"))
      self.sigGEN_handler(signalNumber,frame)


   def sigQUIT_handler(self,
                       signalNumber,
                       frame):
      """Log SIGQUIT and request termination."""
      self.logger.log(logging.INFO,getLogMessage("Received SIGQUIT!"))
      self.sigGEN_handler(signalNumber,frame)


   def sigABRT_handler(self,
                       signalNumber,
                       frame):
      """Log SIGABRT and request termination."""
      self.logger.log(logging.INFO,getLogMessage("Received SIGABRT!"))
      self.sigGEN_handler(signalNumber,frame)


   def sigTERM_handler(self,
                       signalNumber,
                       frame):
      """Log SIGTERM and request termination."""
      self.logger.log(logging.INFO,getLogMessage("Received SIGTERM!"))
      self.sigGEN_handler(signalNumber,frame)


   def _joinChunks(self,
                   chunks):
      """Join chunks returned by os.read() into one native string."""
      joined = b"".join(chunks)
      if not isinstance(joined,str):
         # os.read() yields bytes on Python 3; callers expect text
         joined = joined.decode('utf-8','replace')
      return(joined)


   def executeCommand(self,
                      commandArgs):
      """
      Run commandArgs and collect everything it writes to stdout and stderr.

      Blocks until both output pipes reach end of file and the command has
      been reaped.  Nothing is written to the command's stdin; it is closed
      immediately so a command reading stdin sees end of file.

      Returns the tuple (exitStatus,stdOutput,stdError).  exitStatus is 0 on
      success, the exit code when the command exited with a non zero code,
      the raw wait status when it was killed by a signal, and the errno
      value when the command could not be started.
      """
      outData = []
      errData = []
      try:
         child = subprocess.Popen(commandArgs,bufsize=self.bufferSize,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  close_fds=True)
      except OSError as err:
         self.logger.log(logging.ERROR,getLogMessage("Command: %s\nfailed: %s." % (commandArgs,err.args[1])))
         exitStatus = err.args[0]
      else:
         try:
            # Leaving the write end open would block a command that reads
            # stdin forever while this method waits for its output to end.
            child.stdin.close()

            # Map each pipe still open to the list collecting its data.
            openPipes = {child.stdout.fileno():outData,
                         child.stderr.fileno():errData}
            while openPipes:
               try:
                  ready = select.select(list(openPipes),[],[]) # wait for input
               except select.error as err:
                  if err.args[0] == EINTR:
                     # interrupted by one of our signal handlers, try again
                     continue
                  raise

               for readyFd in ready[0]:
                  chunk = os.read(readyFd,self.bufferSize)
                  if chunk:
                     openPipes[readyFd].append(chunk)
                  else:
                     # empty read means end of file on this pipe
                     del openPipes[readyFd]

            pid,exitStatus = os.waitpid(child.pid,0)
         finally:
            child.stdout.close()
            child.stderr.close()

         if exitStatus != 0:
            if os.WIFSIGNALED(exitStatus):
               self.logger.log(logging.INFO,getLogMessage("%s failed w/ signal %d" % (commandArgs,os.WTERMSIG(exitStatus))))
            else:
               if os.WIFEXITED(exitStatus):
                  exitStatus = os.WEXITSTATUS(exitStatus)
               self.logger.log(logging.INFO,getLogMessage("%s failed w/ exit code %d" % (commandArgs,exitStatus)))
            self.logger.log(logging.INFO,getLogMessage("%s" % (self._joinChunks(errData))))

      return(exitStatus,self._joinChunks(outData),self._joinChunks(errData))


   def startNewRemoteMonitor(self,
                             siteDesignator):
      """
      Run the site monitor script for siteDesignator.

      The call blocks until executeCommand() returns; the command's exit
      status and output are not inspected here.
      """
      self.logger.log(logging.DEBUG,getLogMessage("Start %s jobSite monitor." % (siteDesignator)))
      commandArgs = [self.monitorSitePath,siteDesignator]
      exitStatus,stdOutput,stdError = self.executeCommand(commandArgs)


   def _startJobSiteMonitorIfIdle(self,
                                  siteDesignator):
      """Register siteDesignator as pending and start its monitor unless it is already active or pending."""
      if not self.jobMonitor.isJobSiteActive(siteDesignator):
         if not self.jobMonitor.isJobSitePending(siteDesignator):
            self.jobMonitor.addPendingJobSite(siteDesignator)
            self.startNewRemoteMonitor(siteDesignator)


   def killOrphanJobSiteMonitors(self):
      """
      Kill processes whose command line matches the site monitor script path.

      When pgrep reports a match (exit status 0) the processes are sent
      TERM; after two seconds any that still match are sent KILL.
      """
      findCommand = ['pgrep','-f',self.monitorSitePath]
      exitStatus,stdOutput,stdError = self.executeCommand(findCommand)
      if not exitStatus:
         self.logger.log(logging.DEBUG,getLogMessage("Kill jobSite monitors."))
         commandArgs = ['pkill','-TERM','-f',self.monitorSitePath]
         exitStatus,stdOutput,stdError = self.executeCommand(commandArgs)
         time.sleep(2)

         exitStatus,stdOutput,stdError = self.executeCommand(findCommand)
         if not exitStatus:
            commandArgs = ['pkill','-KILL','-f',self.monitorSitePath]
            exitStatus,stdOutput,stdError = self.executeCommand(commandArgs)


   def _dispatchConnection(self,
                           listeningReader):
      """
      Double-fork so the accepted connection is served outside the master.

      The master closes its copy of the client connection, reaps the
      intermediate child and returns.  The intermediate child exits.  The
      second child starts a new session, becomes a 'RequestProcessor',
      performs the handshake and returns to the caller's loop.  A failed
      fork is logged and the failing process exits with the errno value.
      """
      try:
         fork1PID = os.fork()
      except OSError as err:
         self.logger.log(logging.ERROR,getLogMessage("Daemonizing fork1 failed: %s." % (err.args[1])))
         sys.exit(err.args[0])

      if fork1PID != 0:
         self.jobMonitor.closeConnection()  # Close the client socket in the listening server
         os.wait()                          # Wait for the intermediate child to exit
         return

      try:
         fork2PID = os.fork()
      except OSError as err:
         self.logger.log(logging.ERROR,getLogMessage("Daemonizing fork2 failed: %s." % (err.args[1])))
         sys.exit(err.args[0])

      if fork2PID != 0:
         sys.exit(0) # This is the intermediate child.  Exit.

      # This is the real child.
      os.setsid()
      self.monitorType = 'RequestProcessor'
      if self.jobMonitor.acceptHandshake(listeningReader):
         self.jobMonitor.closeListeningConnections()
      else:
         self.logger.log(logging.ERROR,getLogMessage("Connection acceptHandshake failed."))


   def monitor(self):
      """
      Run the service loop until the job monitor has nothing left to watch.

      Exits the process with status 1 if the job monitor is not listening.
      Each pass the master purges aged jobs and file tails, updates user
      activity scores, releases registered jobs (at most every 30 seconds),
      then waits in select() for up to ACTIVITYUPDATEINTERVAL seconds.
      New connections are handed to _dispatchConnection(); readable and
      writable connections are serviced through the job monitor.  The loop
      ends when the job monitor reports no listening socket, readers or
      writers.
      """
      if not self.jobMonitor.isListening():
         self.logger.log(logging.ERROR,getLogMessage("Port binding failed"))
         sys.exit(1)

      self.logger.log(logging.INFO,getLogMessage("***********************************"))
      self.logger.log(logging.INFO,getLogMessage("* distributor job monitor started *"))
      self.logger.log(logging.INFO,getLogMessage("***********************************"))

      if self.terminateJobMonitor:
         self.jobMonitor.terminate()
      if self.jobMonitorSetInfoError:
         self.jobMonitor.terminate()

      timeLastPurgeReportedJobDone = 0
      timeLastPurgeJobDone         = 0
      lastReportedActiveJobCount   = 0

      timeLastJobRelease    = 0
      timeBetweenJobRelease = 30

      while True:
         if self.monitorType == 'Master':
            nPurgeActiveJobs = 0
            now = time.time()
            # 'Dr' and 'D' are status codes understood by JobMonitor; going by
            # the age limit names they presumably mean reported-done and done.
            if now-timeLastPurgeReportedJobDone > self.purgeJobInterval:
               nPurgeActiveJobs += self.jobMonitor.purgeActiveJobs('Dr',self.reportedJobDoneAgeLimit)
               timeLastPurgeReportedJobDone = now
            if now-timeLastPurgeJobDone > self.purgeJobInterval:
               nPurgeActiveJobs += self.jobMonitor.purgeActiveJobs('D',self.jobDoneAgeLimit)
               timeLastPurgeJobDone = now
            if nPurgeActiveJobs > 0:
               self.logger.log(logging.INFO,getLogMessage("%d jobs purged" % (nPurgeActiveJobs)))
               self.jobMonitor.dumpActiveJobs()
            activeJobCount = self.jobMonitor.getActiveJobCount()
            if activeJobCount != lastReportedActiveJobCount:
               self.logger.log(logging.INFO,getLogMessage("%d monitored jobs" % (activeJobCount)))
            lastReportedActiveJobCount = activeJobCount

            nPurgeFileTails = self.jobMonitor.purgeFileTails()
            if nPurgeFileTails > 0:
               self.logger.log(logging.INFO,getLogMessage("%d file tails purged" % (nPurgeFileTails)))

            self.jobMonitor.updateUserActivityScores()
            self.jobMonitor.purgeUserActivityScores()

            if now-timeLastJobRelease > timeBetweenJobRelease:
               nReleasedJobs = self.jobMonitor.releaseRegisteredJobs()
               timeLastJobRelease = now
               if nReleasedJobs > 0:
                  self.logger.log(logging.INFO,getLogMessage("%d jobs released" % (nReleasedJobs)))

         listeningSocket,activeReaders = self.jobMonitor.getInputObjects()
         activeWriters                 = self.jobMonitor.getOutputObjects()
         if not listeningSocket and not activeReaders and not activeWriters:
            if self.monitorType == 'Master':
               self.jobMonitor.dumpActiveJobs()
               self.logger.log(logging.INFO,getLogMessage("***********************************"))
               self.logger.log(logging.INFO,getLogMessage("* distributor job monitor stopped *"))
               self.logger.log(logging.INFO,getLogMessage("***********************************"))
            break

         try:
            readyReaders,readyWriters,readyExceptions = select.select(listeningSocket+activeReaders,
                                                                      activeWriters,
                                                                      [],
                                                                      ACTIVITYUPDATEINTERVAL)
         except select.error as err:
            # Nothing is ready on this pass whatever the reason; the lists must
            # be reset so the loops below never see unbound or stale values.
            readyReaders = []
            readyWriters = []
            if err.args[0] != EINTR:
               self.terminate()

         for readyReader in readyReaders:
            if   readyReader in listeningSocket:
               if not self.jobMonitor.acceptConnection(readyReader):
                  self.logger.log(logging.ERROR,getLogMessage("Connection failed."))
               else:
                  # Do a double-fork to dissociate from the listening server.
                  self._dispatchConnection(readyReader)
            elif readyReader in activeReaders:
               self.jobMonitor.receiveMessage()

         self.jobMonitor.processRequests()

         if self.monitorType == 'Master':
            for siteDesignator in self.jobMonitor.getPendingJobPostingSites():
               self._startJobSiteMonitorIfIdle(siteDesignator)

            for siteDesignator in self.jobMonitor.getRestartJobSites():
               self._startJobSiteMonitorIfIdle(siteDesignator)
               self.jobMonitor.deleteRestartJobSite(siteDesignator)

         for readyWriter in readyWriters:
            if readyWriter in activeWriters:
               self.jobMonitor.sendMessage()

         if self.monitorType == 'Master':
            if self.terminateJobMonitor:
               self.jobMonitor.terminate()

            if self.resetInfo:
               # requested through SIGINT
               self.jobMonitorSetInfoError = self.jobMonitor.setInfo()
               self.resetInfo = False
               if self.jobMonitorSetInfoError:
                  self.jobMonitor.terminate()


if __name__ == '__main__':

   # Open the log file first so its descriptor can be handed to daemonize(),
   # which redirects stdout/stderr onto it before detaching the process.
   fdLogFile = openLogger(MONITORLOGLOCATION,MONITORLOGFILENAME)
   daemonize(fdLogFile)

   # Only the detached process gets here; build the monitor from the
   # module-level configuration constants and run its service loop.
   __monitorJob__ = MonitorJob(CONFIGURATIONDIRECTORY,MONITORCONFIGURATIONFILE,
                               DAEMONSCONFIGURATIONFILE,INFOSCONFIGURATIONFILE,
                               MONITORSITEPATH)
   __monitorJob__.monitor()


