#!/usr/bin/env python
#
# Copyright (c) 2004-2010 Purdue University All rights reserved.
# 
# Developed by: HUBzero Technology Group, Purdue University
#               http://hubzero.org
# 
# HUBzero is free software: you can redistribute it and/or modify it under the terms of the
# GNU Lesser General Public License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later version.
# 
# HUBzero is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.  You should have received a
# copy of the GNU Lesser General Public License along with HUBzero.
# If not, see <http://www.gnu.org/licenses/>.
# 
# GNU LESSER GENERAL PUBLIC LICENSE
# Version 3, 29 June 2007
# Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
#
# ----------------------------------------------------------------------
#  monitorJob.py
#
#  script which gathers monitoring statistics from remote sites and
#  reports job status to distributor on demand
#
import sys
import os
import os.path
import popen2
import select
import signal
import socket
import time
from errno import EINTR

from LogMessage          import getLogMessageFileNo, openLog, logPID as log
from MonitorsInfo        import MonitorsInfo
from JobMonitor          import JobMonitor
from RemoteTunnelMonitor import RemoteTunnelMonitor

# Pid of the ssh child started by getRemoteQueueJobStatus(); 0 while none is running.
siteMonitorPid = 0

# Address handed to JobMonitor, and the root directory of the submit installation.
monitorHost = ""
monitorPort = 5727
monitorRoot = os.path.join(os.sep, "opt", "submit")

# Address handed to RemoteTunnelMonitor.
remoteTunnelMonitorHost = ""
remoteTunnelMonitorPort = 5729

# Log and active-job dump file naming.
monitorLogLocation = os.path.join(os.sep, "var", "log", "submit")
logFileName = "monitorJob.log"
dumpFileName = "monitorJob.dump"

# Read descriptor of a site monitor pipe -> (site, inputFile, outputFile, child pid).
remoteSiteJobInfo = {}
# Site name -> file object used to send new job ids to that site's monitor process.
remoteSiteNotify = {}

# Both are created in the __main__ section once the process has daemonized.
monitorsInfo = None
jobMonitor = None

# Private key path passed to MonitorsInfo.getSSHCommand().
IDENTITY = os.path.join(monitorRoot, ".ssh", "submit_rsa")


def terminate():
   """Stop whatever child processes this process is responsible for.

   When siteMonitorPid is non-zero this process is relaying for an ssh
   child (see getRemoteQueueJobStatus) and only that child is signalled.
   Otherwise the active jobs are dumped and every child recorded in
   remoteSiteJobInfo is sent SIGTERM.
   """
   global siteMonitorPid
   global remoteSiteJobInfo
   global jobMonitor

   if siteMonitorPid:
      log("Send TERM to child ssh process")
      try:
         os.kill(siteMonitorPid,signal.SIGTERM)
      except OSError:
         # the child may already have exited
         log("Unable to signal child ssh process %d" % (siteMonitorPid))
      log("distributor site monitor stopped")
   else:
      # A signal can arrive before the JobMonitor has been created.
      if jobMonitor is not None:
         jobMonitor.dumpActiveJobs()
      for site in list(remoteSiteJobInfo):
         jobsSite,inputFile,outputFile,childPid = remoteSiteJobInfo[site]
         log("Send TERM to child site %s process" % (jobsSite))
         try:
            os.kill(childPid,signal.SIGTERM)
         except OSError:
            # keep going so the remaining children are still signalled
            log("Unable to signal child site %s process %d" % (jobsSite,childPid))
      log("***********************************")
      log("* distributor job monitor stopped *")
      log("***********************************")


def sigGEN_handler(signalType, frame):
   """Shared shutdown path for signal handlers: run terminate(), then exit with status 1."""
   terminate()
   raise SystemExit(1)


def sigINT_handler(signal, frame):
   """Reload the site monitor description file on SIGINT.

   The replacement MonitorsInfo is built before the global is touched, so a
   failed reload leaves the previously loaded information in place instead
   of an undefined name.
   """
   global monitorRoot
   global monitorsInfo

   log("Received SIGINT!")
   try:
      reloadedMonitorsInfo = MonitorsInfo(monitorRoot,"monitors.dat")
   except Exception:
      log("Site Monitor Info Reload Failed: %s" % (sys.exc_info()[1]))
      return
   monitorsInfo = reloadedMonitorsInfo
   log("Site Monitor Info Reloaded!")


def sigHUP_handler(signal, frame):
   """Log the SIGHUP, then hand off to sigGEN_handler."""
   log("Received SIGHUP!")
   sigGEN_handler(signalType=signal, frame=frame)

def sigQUIT_handler(signal, frame):
   """Log the SIGQUIT, then hand off to sigGEN_handler."""
   log("Received SIGQUIT!")
   sigGEN_handler(signalType=signal, frame=frame)

def sigABRT_handler(signal, frame):
   """Log the SIGABRT, then hand off to sigGEN_handler."""
   log("Received SIGABRT!")
   sigGEN_handler(signalType=signal, frame=frame)

def sigTERM_handler(signal, frame):
   """Log the SIGTERM, then hand off to sigGEN_handler."""
   log("Received SIGTERM!")
   sigGEN_handler(signalType=signal, frame=frame)

# Install the handlers defined above, in the same order as before.
for _signalNumber,_signalHandler in ((signal.SIGINT,  sigINT_handler),
                                     (signal.SIGHUP,  sigHUP_handler),
                                     (signal.SIGQUIT, sigQUIT_handler),
                                     (signal.SIGABRT, sigABRT_handler),
                                     (signal.SIGTERM, sigTERM_handler)):
   signal.signal(_signalNumber,_signalHandler)
# do not leave the loop variables behind as module globals
del _signalNumber,_signalHandler


def getRemoteQueueJobStatus(command,
                            inputFd,
                            outputFile):
   """Run command and relay data between it and the caller until it finishes.

   Data read from inputFd is forwarded to the command's stdin.  The
   command's stdout is written (and flushed) to outputFile as it arrives
   and is also collected; its stderr is collected only.  While the command
   runs its pid is published in the global siteMonitorPid so terminate()
   can signal it.

   Parameters:
      command    - command line handed to popen2.Popen3
      inputFd    - integer file descriptor to read forwarded input from
      outputFile - file object receiving the command's stdout

   Returns a 4-tuple: (value returned by child.wait(), collected stdout,
   collected stderr, ""); the last element is always an empty string.
   """
   global siteMonitorPid

   child = popen2.Popen3(command,1)
   siteMonitorPid = child.pid
   childoutFd = child.fromchild.fileno()
   childerrFd = child.childerr.fileno()

   inEOF = outEOF = errEOF = 0
   BUFSIZ = 4096

   outData = []
   errData = []

   while 1:
      toCheck = []
      # Once inputFd has reported EOF it is always "readable"; leaving it in
      # the set would turn this loop into a busy spin.
      if not inEOF:
         toCheck.append(inputFd)
      if not outEOF:
         toCheck.append(childoutFd)
      if not errEOF:
         toCheck.append(childerrFd)
      try:
         ready = select.select(toCheck,[],[]) # wait for input
      except select.error:
         # A handled signal (e.g. SIGINT reload) interrupts select; just retry.
         if sys.exc_info()[1].args[0] == EINTR:
            continue
         raise

      if inputFd in ready[0]:
         inChunk = os.read(inputFd,BUFSIZ)
         if inChunk != '':
            child.tochild.write(inChunk)
            child.tochild.flush()
         else:
            # The command's stdin is deliberately left open, as before.
            inEOF = 1

      if childoutFd in ready[0]:
         outChunk = os.read(childoutFd,BUFSIZ)
         if outChunk == '':
            outEOF = 1
         else:
            outData.append(outChunk)
            outputFile.write(outChunk)
            outputFile.flush()

      if childerrFd in ready[0]:
         errChunk = os.read(childerrFd,BUFSIZ)
         if errChunk == '':
            errEOF = 1
         else:
            errData.append(errChunk)

      if outEOF and errEOF:
         break

   err = child.wait()
   siteMonitorPid = 0
   errText = "".join(errData)
   if err != 0:
      log("%s failed w/ exit code %d" % (command,err))
      log("%s" % (errText))

   return  err,"".join(outData),errText,""


if __name__ == '__main__':
   # Daemon entry point.
   #   1. open the log and point stdout/stderr at it, stdin at /dev/null
   #   2. double fork so the monitor runs detached in its own session
   #   3. create the JobMonitor and reload the previously dumped jobs
   #   4. loop forever on select(), handling
   #        - new client connections on the bound monitor socket
   #        - status updates arriving on pipes from per site child processes
   #        - client requests, which may require launching a new per site
   #          child process and/or announcing a new job id to one

   openLog(os.path.join(monitorRoot,monitorLogLocation,logFileName))

   logMessageFileNo = getLogMessageFileNo()
   if logMessageFileNo != sys.stdout.fileno():
      os.close(sys.stdin.fileno())
      os.close(sys.stdout.fileno())
      os.close(sys.stderr.fileno())
      os.dup2(logMessageFileNo,1)
      os.dup2(logMessageFileNo,2)
      # dup2(source,target): make stdin refer to /dev/null.  The open normally
      # lands on descriptor 0 already, in which case the dup2 is a no-op.
      devnull = open("/dev/null","r")
      os.dup2(devnull.fileno(),sys.stdin.fileno())

   if os.fork() != 0:
      os.wait()
      sys.exit(0)
   else:
      os.setsid()
      pid = os.fork()
      if pid != 0:
         sys.exit(0)

   log("***********************************")
   log("* distributor job monitor started *")
   log("***********************************")

   time.sleep(2)

   monitorsInfo        = MonitorsInfo(monitorRoot,"monitors.dat")
   remoteTunnelMonitor = RemoteTunnelMonitor(remoteTunnelMonitorHost,remoteTunnelMonitorPort)

   jobMonitor = JobMonitor(monitorHost,monitorPort,activeJobDumpPath=os.path.join(monitorRoot,dumpFileName))
   if not jobMonitor.isBound():
      sys.exit(1)

   jobMonitor.loadActiveJobs()

   inputDescriptors  = []
   clientConnections = []

   monitorSocketFd = jobMonitor.boundFileDescriptor()
   inputDescriptors.append(monitorSocketFd)

   while 1:
      newJobSite = ""
      newJobId   = ""
      try:
         readyInputFds = select.select(inputDescriptors,[],[])[0]
      except select.error:
         err = sys.exc_info()[1]
         if err.args[0] == EINTR:
            # interrupted by a handled signal, nothing is ready
            readyInputFds = []
         else:
            # log every descriptor that is no longer valid before giving up
            for inputDescriptor in inputDescriptors:
               if isinstance(inputDescriptor,socket.socket):
                  try:
                     os.fstat(inputDescriptor.fileno())
                  except Exception:
                     log(inputDescriptor)
               else:
                  try:
                     os.fstat(inputDescriptor)
                  except Exception:
                     log(inputDescriptor)
            terminate()
            raise

      if monitorSocketFd in readyInputFds:
         channel = jobMonitor.acceptConnection()
         clientConnections.append(channel)
         inputDescriptors.append(channel)

      # status updates from per site child processes
      for readyInputFd in readyInputFds:
         if (readyInputFd != monitorSocketFd) and (readyInputFd not in clientConnections):
            messageSite,inputFile,outputFile,childPid = remoteSiteJobInfo[readyInputFd]
            log("Update message received from %s" % (messageSite))
            updateReceived = False
            message = inputFile.readline()
#           log("message: " + message)

            # expected line layout: "<count> <site>:<job> <status> <stage>:..."
            messages = message.split(":")
            if messages[0] != "":
               try:
                  headerLength,headerSite = messages[0].split()
                  messageLength = int(headerLength)
               except ValueError:
                  # a garbled header must not take the whole monitor down
                  log("Malformed message header received from %s" % (messageSite))
                  continue
               messageSite = headerSite
            else:
               messageLength = 0

            if messageLength == 0:
               # zero length (or end of file) means the site monitor is finished
               os.waitpid(childPid,0)
               jobMonitor.deleteActiveJobSite(messageSite)
               inputDescriptors.remove(readyInputFd)
               del remoteSiteJobInfo[readyInputFd]
               remoteSiteNotify.pop(messageSite,None)
               inputFile.close()
               outputFile.close()
               log("Closed %s" % (messageSite))
            else:
               del messages[0]
               if len(messages) != messageLength:
                  log("Incomplete message received from %s" % (messageSite))
               for message in messages:
                  try:
                     localJobId,jobStatus,jobStage = message.split()
                     globalJobId = messageSite + ":" + localJobId
                     jobMonitor.addActiveJob(globalJobId,jobStatus,jobStage)
                  except Exception:
                     # Fields that do not parse are skipped.  A bare except here
                     # would also swallow the SystemExit raised by the signal
                     # handlers.
                     pass
               jobMonitor.dumpActiveJobs()
               updateReceived = True
            del messages

            if updateReceived:
               jobMonitor.addActiveJobSite(messageSite,time.time())
               jobMonitor.markNewActiveJobsAsDone(messageSite)

      # requests from connected clients
      for channel in readyInputFds:
         if channel in clientConnections:
            channelClosed,newJobSite,newJobId = jobMonitor.processRequest(channel)
            if channelClosed:
               clientConnections.remove(channel)
               inputDescriptors.remove(channel)

            if newJobSite != "":
               if not jobMonitor.isJobSiteActive(newJobSite):
                  sshCommand,tunnelDesignator = monitorsInfo.getSSHCommand(newJobSite,IDENTITY,remoteTunnelMonitor)

                  if sshCommand != "":
                     parentReceiveFd,childSendFd = os.pipe()
                     childReceiveFd,parentSendFd = os.pipe()

                     pid = os.fork()
                     if pid:
# parent
                        try:
                           os.close(childReceiveFd)
                        except OSError:
                           log("close(childReceiveFd) failed")
                        try:
                           os.close(childSendFd)
                        except OSError:
                           log("close(childSendFd) failed")
                        inputFile  = os.fdopen(parentReceiveFd,"r")
                        outputFile = os.fdopen(parentSendFd,"w")
                        inputDescriptors.append(parentReceiveFd)
                        remoteSiteJobInfo[parentReceiveFd] = (newJobSite,inputFile,outputFile,pid)
                        remoteSiteNotify[newJobSite] = (outputFile)
                        jobMonitor.addActiveJobSite(newJobSite,time.time())
                     else:
# child
                        try:
                           os.close(parentReceiveFd)
                        except OSError:
                           log("close(parentReceiveFd) failed")
                        try:
                           os.close(parentSendFd)
                        except OSError:
                           log("close(parentSendFd) failed")
                        log("Launching %s" % (newJobSite))
                        outputFile = os.fdopen(childSendFd,"w")
                        if tunnelDesignator != "":
                           remoteTunnelMonitor.incrementTunnelUse(tunnelDesignator)
                        getRemoteQueueJobStatus(sshCommand,childReceiveFd,outputFile)
                        if tunnelDesignator != "":
                           remoteTunnelMonitor.decrementTunnelUse(tunnelDesignator)
                        # zero length message tells the parent this site is done
                        outputFile.write("0 %s:\n" % (newJobSite))
                        outputFile.flush()
                        outputFile.close()
                        try:
                           os.close(childReceiveFd)
                        except OSError:
                           log("close(childReceiveFd) failed")
                        log("Closing %s" % (newJobSite))
                        sys.exit(0)

               if newJobId != "":
                  # There is no entry when no ssh command exists for the site.
                  outputFile = remoteSiteNotify.get(newJobSite)
                  if outputFile is None:
                     log("No monitor process for site %s, job %s not announced" % (newJobSite,newJobId))
                  else:
                     try:
                        outputFile.write(newJobId + "\n")
                        outputFile.flush()
                     except (IOError,OSError,ValueError):
                        log("Unable to announce job %s to site %s" % (newJobId,newJobSite))

      time.sleep(2)

