#!/usr/bin/env python
#
# @package      hubzero-submit-distributor
# @file         monitorJob.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2004-2012 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2004-2012 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# ----------------------------------------------------------------------
#  monitorJob.py
#
#  script which gathers monitoring statistics from remote sites and
#  reports job status to distributor on demand
#
import sys
import os
import subprocess
import select
import signal
import socket
import time
from errno import EINTR

from hubzero.submit.LogMessage          import getLogMessageFileNo, openLog, logPID as log
from hubzero.submit.MonitorsInfo        import MonitorsInfo
from hubzero.submit.JobMonitor          import JobMonitor
from hubzero.submit.RemoteTunnelMonitor import RemoteTunnelMonitor

# Host/port pairs handed to MonitorJob, which passes them on to JobMonitor and
# RemoteTunnelMonitor respectively.
# NOTE(review): an empty host presumably means "all local interfaces" --
# confirm against the JobMonitor / RemoteTunnelMonitor implementations.
JOBMONITORHOST, JOBMONITORPORT = "", 5727
TUNNELMONITORHOST, TUNNELMONITORPORT = "", 5729

# Directory roots.
MONITORROOT = os.path.join(os.sep, 'opt', 'submit')
MONITORLOGLOCATION = os.path.join(os.sep, 'var', 'log', 'submit', 'monitors')
MONITORSINFOLOCATION = MONITORROOT

# Bare file names.
MONITORLOGFILENAME = "monitorJob.log"
DBFILENAME = "monitorJobDB"
DUMPFILENAME = "monitorJob.dump"
MONITORSINFOFILENAME = "monitors.dat"

# Full paths derived from the roots and file names above.
LOGPATH = os.path.join(MONITORLOGLOCATION, MONITORLOGFILENAME)
DBFILEPATH = os.path.join(MONITORROOT, DBFILENAME)
DUMPFILEPATH = os.path.join(MONITORROOT, DUMPFILENAME)
SSHIDENTITY = os.path.join(MONITORROOT, ".ssh", "submit_rsa")
BINDIRECTORY = os.path.join(MONITORROOT, 'bin')


class MonitorJob:
   def __init__(self,
                jobMonitorHost,
                jobMonitorPort,
                tunnelMonitorHost,
                tunnelMonitorPort,
                monitorsInfoLocation,
                monitorsInfoFilename,
                dbFilePath,
                dumpFilePath,
                sshIdentity):
      self.jobMonitorHost       = jobMonitorHost
      self.jobMonitorPort       = jobMonitorPort
      self.tunnelMonitorHost    = tunnelMonitorHost
      self.tunnelMonitorPort    = tunnelMonitorPort
      self.monitorsInfoLocation = monitorsInfoLocation
      self.monitorsInfoFilename = monitorsInfoFilename
      self.dbFilePath           = dbFilePath
      self.dumpFilePath         = dumpFilePath
      self.sshIdentity          = sshIdentity

      self.bufferSize        = 4096
      self.siteMonitorPid    = 0
      self.remoteSiteJobInfo = {}
      self.remoteSiteNotify  = {}
      self.inputDescriptors  = []
      self.clientConnections = []

      self.monitorsInfo        = None
      self.jobMonitor          = None
      self.remoteTunnelMonitor = None

      self.reportedJobDoneAgeLimit = 60*60*3
      self.jobDoneAgeLimit         = 60*60*6
      self.purgeJobInterval        = 60*60

      signal.signal(signal.SIGINT,self.sigINT_handler)
      signal.signal(signal.SIGHUP,self.sigHUP_handler)
      signal.signal(signal.SIGQUIT,self.sigQUIT_handler)
      signal.signal(signal.SIGABRT,self.sigABRT_handler)
      signal.signal(signal.SIGTERM,self.sigTERM_handler)


   def terminate(self):
      """Shut down whichever role this process is playing.

      When siteMonitorPid is set (getRemoteQueueJobStatus sets it while it is
      relaying for a remote site) only that child process is sent SIGTERM.
      Otherwise the active jobs are dumped, the job monitor is closed and
      every registered per-site child process is sent SIGTERM.

      A failing kill is logged and does not stop the remaining cleanup.

      NOTE(review): a forked site process that is signalled before
      getRemoteQueueJobStatus has set siteMonitorPid takes the second branch
      with the state it inherited from its parent -- confirm whether that
      window matters.
      """
      if self.siteMonitorPid:
         log("Send TERM to child ssh process")
         try:
            os.kill(self.siteMonitorPid,signal.SIGTERM)
         except OSError:
            # the child may already be gone
            log("kill(%d) failed: %s" % (self.siteMonitorPid,sys.exc_info()[1]))
         log("distributor site monitor stopped")
      else:
         # jobMonitor is still None when a signal arrives before monitor()
         # has created it
         if self.jobMonitor is not None:
            self.jobMonitor.dumpActiveJobs()
            self.jobMonitor.close()
         for siteJobInfo in list(self.remoteSiteJobInfo.values()):
            jobsSite,childPid = siteJobInfo[0],siteJobInfo[3]
            log("Send TERM to child site %s process" % (jobsSite))
            try:
               os.kill(childPid,signal.SIGTERM)
            except OSError:
               # keep going so the remaining children are still signalled
               log("kill(%d) failed: %s" % (childPid,sys.exc_info()[1]))
         log("***********************************")
         log("* distributor job monitor stopped *")
         log("***********************************")


   def sigGEN_handler(self,
                      signalNumber,
                      frame):
      self.terminate()
      sys.exit(1)

   
   def sigINT_handler(self,
                      signalNumber,
                      frame):
      """Reload the monitors information when SIGINT is received.

      The replacement MonitorsInfo is built first and only swapped in when
      construction succeeded, so a failed reload leaves the previously loaded
      information in place instead of leaving the attribute undefined.
      signalNumber and frame are the standard signal handler arguments and
      are not used here.
      """
      log("Received SIGINT!")
      try:
         reloadedMonitorsInfo = MonitorsInfo(self.monitorsInfoLocation,self.monitorsInfoFilename)
      except Exception:
         log("Site Monitor Info Reload failed: %s" % (sys.exc_info()[1]))
      else:
         self.monitorsInfo = reloadedMonitorsInfo
         log("Site Monitor Info Reloaded!")
   
   
   def sigHUP_handler(self, signalNumber, frame):
      """Log receipt of SIGHUP, then shut down through sigGEN_handler."""
      log("Received SIGHUP!")
      self.sigGEN_handler(signalNumber,frame)
   
   
   def sigQUIT_handler(self, signalNumber, frame):
      """Log receipt of SIGQUIT, then shut down through sigGEN_handler."""
      log("Received SIGQUIT!")
      self.sigGEN_handler(signalNumber,frame)
   
   
   def sigABRT_handler(self, signalNumber, frame):
      """Log receipt of SIGABRT, then shut down through sigGEN_handler."""
      log("Received SIGABRT!")
      self.sigGEN_handler(signalNumber,frame)
   
   
   def sigTERM_handler(self, signalNumber, frame):
      """Log receipt of SIGTERM, then shut down through sigGEN_handler."""
      log("Received SIGTERM!")
      self.sigGEN_handler(signalNumber,frame)
   

   def getRemoteQueueJobStatus(self,
                               command,
                               inputFd,
                               outputFile):
      """Run command and relay data between it and the caller's pipes.

      Data read from the descriptor inputFd is written to the command's
      stdin, the command's stdout is copied to outputFile as it arrives and
      its stderr is collected.  The loop ends once both stdout and stderr of
      the command have reached end-of-file.  While the command runs its pid
      is kept in self.siteMonitorPid so that terminate() can signal it.

      command    -- command line, executed through the shell
                    NOTE(review): it comes from MonitorsInfo.getSSHCommand;
                    confirm it can never contain untrusted text
      inputFd    -- file descriptor (integer) to read requests from
      outputFile -- file object that receives the command's stdout

      Returns the tuple (exitStatus, stdoutText, stderrText, ""); the last
      element is always an empty string.
      """
      child = subprocess.Popen(command,shell=True,bufsize=self.bufferSize,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
      self.siteMonitorPid = child.pid
      childin    = child.stdin
      childoutFd = child.stdout.fileno()
      childerrFd = child.stderr.fileno()

      outEOF = errEOF = False

      outData = []
      errData = []

      while not (outEOF and errEOF):
         # parent pid 1 means the original parent is gone: stop via the
         # regular SIGTERM handling
         if os.getppid() == 1:
            os.kill(os.getpid(),signal.SIGTERM)
         toCheck = [inputFd]
         if not outEOF:
            toCheck.append(childoutFd)
         if not errEOF:
            toCheck.append(childerrFd)
         try:
            readable = select.select(toCheck,[],[])[0] # wait for input
         except select.error:
            # a handled signal (the handlers are inherited from the parent)
            # interrupts select(); that is not a failure, just wait again
            if sys.exc_info()[1].args[0] == EINTR:
               continue
            raise

         if inputFd in readable:
            inChunk = os.read(inputFd,self.bufferSize)
            # NOTE(review): at end-of-file inputFd stays in the select set,
            # so the loop keeps cycling (and re-running the getppid() check
            # above) until the command ends -- confirm this is intended
            # before changing it
            if inChunk:
               childin.write(inChunk)
               childin.flush()

         if childoutFd in readable:
            outChunk = os.read(childoutFd,self.bufferSize)
            if outChunk:
               outData.append(outChunk)
               outputFile.write(outChunk)
               outputFile.flush()
            else:
               outEOF = True

         if childerrFd in readable:
            errChunk = os.read(childerrFd,self.bufferSize)
            if errChunk:
               errData.append(errChunk)
            else:
               errEOF = True

      err = child.wait()
      self.siteMonitorPid = 0
      stderrText = "".join(errData)
      if err != 0:
         log("%s failed w/ exit code %d" % (command,err))
         log("%s" % (stderrText))

      return(err,"".join(outData),stderrText,"")
   

   def startNewRemoteMonitor(self,
                             newJobSite):
      """Fork a child process that monitors newJobSite.

      The ssh command for the site is obtained from the monitors
      information; nothing happens when that command is empty.  Two pipes
      connect parent and child:

        parent side -- registers the read end with the select() set, records
                       (site, inputFile, outputFile, childPid) under that
                       descriptor, records the write side under the site name
                       and marks the site active in the job monitor
        child side  -- relays through getRemoteQueueJobStatus until the
                       command ends, writes the closing "0 <site>:" line and
                       exits with status 0; this method does not return in
                       the child

      When a tunnel designator is returned with the ssh command, the tunnel
      use count is incremented for the duration of the relay and decremented
      again even if the relay raises.
      """
      sshCommand,tunnelDesignator = self.monitorsInfo.getSSHCommand(newJobSite,self.sshIdentity,self.remoteTunnelMonitor)
      if sshCommand != "":
         parentReceiveFd,childSendFd = os.pipe()
         childReceiveFd,parentSendFd = os.pipe()

         pid = os.fork()
         if pid:
   # parent
            # the child's pipe ends are not needed on this side
            for unusedFd,fdName in ((childReceiveFd,"childReceiveFd"),
                                    (childSendFd,"childSendFd")):
               try:
                  os.close(unusedFd)
               except OSError:
                  log("close(%s) failed" % (fdName))
            inputFile  = os.fdopen(parentReceiveFd,'r')
            outputFile = os.fdopen(parentSendFd,'w')
            self.inputDescriptors.append(parentReceiveFd)
            self.remoteSiteJobInfo[parentReceiveFd] = (newJobSite,inputFile,outputFile,pid)
            self.remoteSiteNotify[newJobSite] = outputFile
            self.jobMonitor.addActiveJobSite(newJobSite,time.time())
         else:
   # child
            # the parent's pipe ends are not needed on this side
            for unusedFd,fdName in ((parentReceiveFd,"parentReceiveFd"),
                                    (parentSendFd,"parentSendFd")):
               try:
                  os.close(unusedFd)
               except OSError:
                  log("close(%s) failed" % (fdName))
            log("Launching %s" % (newJobSite))
            outputFile = os.fdopen(childSendFd,'w')
            if tunnelDesignator != "":
               self.remoteTunnelMonitor.incrementTunnelUse(tunnelDesignator)
            try:
               self.getRemoteQueueJobStatus(sshCommand,childReceiveFd,outputFile)
            finally:
               # keep the tunnel use count balanced when the relay fails
               if tunnelDesignator != "":
                  self.remoteTunnelMonitor.decrementTunnelUse(tunnelDesignator)
            # a zero length message tells the parent this site is finished
            outputFile.write("0 %s:\n" % (newJobSite))
            outputFile.flush()
            outputFile.close()
            try:
               os.close(childReceiveFd)
            except OSError:
               log("close(childReceiveFd) failed")
            log("Closing %s" % (newJobSite))
            sys.exit(0)
   

   def monitor(self):
      self.monitorsInfo        = MonitorsInfo(self.monitorsInfoLocation,self.monitorsInfoFilename)
      self.remoteTunnelMonitor = RemoteTunnelMonitor(self.tunnelMonitorHost,self.tunnelMonitorPort)
   
      self.jobMonitor = JobMonitor(self.jobMonitorHost,self.jobMonitorPort, \
                                   activeJobDBPath=self.dbFilePath, \
                                   activeJobDumpPath=self.dumpFilePath)
      if not self.jobMonitor.isBound():
         sys.exit(1)
   
      log("***********************************")
      log("* distributor job monitor started *")
      log("***********************************")
   
      self.jobMonitor.loadActiveJobs()
   
      monitorSocketFd = self.jobMonitor.boundFileDescriptor()
      self.inputDescriptors.append(monitorSocketFd)
   
      timeLastPurgeReportedJobDone = 0
      timeLastPurgeJobDone         = 0
      lastReportedActiveJobCount   = 0
   
      while 1:
         nPurgeActiveJobs = 0
         now = time.time()
         if now-timeLastPurgeReportedJobDone > self.purgeJobInterval:
            nPurgeActiveJobs += self.jobMonitor.purgeActiveJobs('Dr',self.reportedJobDoneAgeLimit)
            timeLastPurgeReportedJobDone = now
         if now-timeLastPurgeJobDone > self.purgeJobInterval:
            nPurgeActiveJobs += self.jobMonitor.purgeActiveJobs('D',self.jobDoneAgeLimit)
            timeLastPurgeJobDone = now
         if nPurgeActiveJobs > 0:
            log("%d jobs purged" % (nPurgeActiveJobs))
            self.jobMonitor.dumpActiveJobs()
         activeJobCount = self.jobMonitor.getActiveJobCount()
         if activeJobCount != lastReportedActiveJobCount:
            log("%d monitored jobs" % (activeJobCount))
         lastReportedActiveJobCount = activeJobCount
   
         newJobSite = ""
         newJobId   = ""
         try:
            readyInputFds = select.select(self.inputDescriptors,[],[])[0]
         except select.error,err:
            if err[0] == EINTR:
               readyInputFds = []
            else:
               for inputDescriptor in self.inputDescriptors:
                  if isinstance(inputDescriptor,socket.socket):
                     try:
                        os.fstat(inputDescriptor.fileno())
                     except:
                        log(inputDescriptor)
                  else:
                     try:
                        os.fstat(inputDescriptor)
                     except:
                        log(inputDescriptor)
               self.terminate()
               raise
   
         if monitorSocketFd in readyInputFds:
            channel = self.jobMonitor.acceptConnection()
            self.clientConnections.append(channel)
            self.inputDescriptors.append(channel)
   
         for readyInputFd in readyInputFds:
            if (readyInputFd != monitorSocketFd) and (readyInputFd not in self.clientConnections):
               messageSite,inputFile,outputFile,childPid = self.remoteSiteJobInfo[readyInputFd]
               log("Update message received from %s" % (messageSite))
               updateReceived = False
               message = inputFile.readline()
#              log("message: " + message)
   
               messages = message.split(':')
               if messages[0] != "":
                  messageLength,messageSite = messages[0].split()
                  messageLength = int(messageLength)
               else:
                  messageLength = 0
   
               if messageLength == 0:
                  os.waitpid(childPid,0)
                  self.jobMonitor.deleteActiveJobSite(messageSite)
                  self.inputDescriptors.remove(readyInputFd)
                  del self.remoteSiteJobInfo[readyInputFd]
                  del self.remoteSiteNotify[messageSite]
                  inputFile.close()
                  outputFile.close()
                  log("Closed %s" % (messageSite))
               else:
                  del messages[0]
                  if len(messages) != messageLength:
                     log("Incomplete message received from %s" % (messageSite))
                  for message in messages:
                     try:
                        localJobId,jobStatus,jobStage,jobQueue = (message.split() + ['?'])[:4]
                        globalJobId = messageSite + ':' + localJobId
                        self.jobMonitor.updateActiveJob(globalJobId,jobStatus,jobStage,jobQueue)
                     except:
                        pass
                  self.jobMonitor.dumpActiveJobs()
                  updateReceived = True
               del messages
   
               if updateReceived:
                  self.jobMonitor.addActiveJobSite(messageSite,time.time())
#           the new submission message should be resent?
#                 self.jobMonitor.markNewActiveJobsAsDone(messageSite)
   
         for channel in readyInputFds:
            if channel in self.clientConnections:
               channelClosed,newJobSite,newJobId = self.jobMonitor.processRequest(channel)
               if channelClosed:
                  self.clientConnections.remove(channel)
                  self.inputDescriptors.remove(channel)
   
               if newJobSite != "":
                  if not self.jobMonitor.isJobSiteActive(newJobSite):
                     self.startNewRemoteMonitor(newJobSite)
   
                  if newJobId != "":
                     try:
                        outputFile = self.remoteSiteNotify[newJobSite]
                        outputFile.write(newJobId + "\n")
                        outputFile.flush()
                     except:
                        pass
   
   
def daemonize():
   """Detach the process from its caller and continue in the background.

   Unless the log file already is standard output, standard input is
   redirected to the null device and standard output/error are redirected
   to the log file descriptor.  The process then forks twice: the original
   process waits for the intermediate one and exits, the intermediate
   process starts a new session and exits, and only the grandchild returns
   from this function (after sleeping one second).
   """
   logMessageFileNo = getLogMessageFileNo()
   if logMessageFileNo != sys.stdout.fileno():
      # dup2() closes the target descriptor itself, so descriptors 0-2 are
      # replaced without ever being left closed
      devnullFd = os.open(os.devnull,os.O_RDWR)
      os.dup2(devnullFd,0)
      os.dup2(logMessageFileNo,1)
      os.dup2(logMessageFileNo,2)
      # drop the extra descriptor unless it already is one of 0-2
      if devnullFd > 2:
         os.close(devnullFd)

   if os.fork() != 0:
      os.wait()
      sys.exit(0)
   else:
      os.setsid()
      pid = os.fork()
      if pid != 0:
         sys.exit(0)

   time.sleep(1)


if __name__ == '__main__':

   # open the log before detaching so daemonize() can redirect output to it
   openLog(LOGPATH)
   daemonize()

   # put the submit bin directory in front of the inherited search path
   os.environ['PATH'] = ':'.join((BINDIRECTORY,os.environ['PATH']))

   __monitorJob__ = MonitorJob(JOBMONITORHOST,JOBMONITORPORT,
                               TUNNELMONITORHOST,TUNNELMONITORPORT,
                               MONITORSINFOLOCATION,MONITORSINFOFILENAME,
                               DBFILEPATH,DUMPFILEPATH,
                               SSHIDENTITY)
   __monitorJob__.monitor()


