#!/usr/bin/php
<?php
# @package      hubzero-metrics
# @file         xlogimport_webhits
# @author       Nicholas J. Kisseberth <nkissebe@purdue.edu>
# @author       Swaroop Samek <swaroop@purdue.edu>
# @copyright    Copyright (c) 2011-2015 HUBzero Foundation, LLC.
# @license      http://opensource.org/licenses/MIT MIT
#
# Copyright (c) 2011-2015 HUBzero Foundation, LLC.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# =========================================================================
# This Script imports apache logs into the web and webhits tables
#
# USAGE: ./xlogimport_webhits <filename>
#
# =========================================================================

# Report everything except notices and show errors on the console.
error_reporting(E_ALL & ~E_NOTICE);
@ini_set('display_errors','1');

# Fallback for interpreters without the __DIR__ magic constant (added in
# PHP 5.3). Note that this fallback value ends with a "/".
if(!defined('__DIR__')) {
    $fPos = strrpos(__FILE__, "/");
    define("__DIR__", substr(__FILE__, 0, $fPos) . "/");
}

require_once(__DIR__."/../includes/hub_parameters.php");
require_once(__DIR__."/../includes/db_connect.php");
require_once(__DIR__."/../includes/func_misc.php");

$db_hub = db_connect('db_hub');

# The log file to import is the single required command-line argument.
# Check for it explicitly so a missing argument yields a usage message
# rather than a confusing fopen() failure on an empty name.
# NOTE(review): clean_exit() comes from the includes; it is presumed to print
# the message and terminate the script -- confirm.
if (!isset($_SERVER['argv'][1]) || $_SERVER['argv'][1] === '') {
	clean_exit("USAGE: ./xlogimport_webhits <filename>\n");
}

$filehandle = fopen($_SERVER['argv'][1], "r");

if (!$filehandle) {
    $msg = "Error opening file: ".$_SERVER['argv'][1]."\n";
	clean_exit($msg);
}

# Accumulates a message for every line that matches neither log pattern;
# printed once the whole file has been read.
$unrec = '';

# building excluded IP list
$filtered_ips = gen_exclude_list('ip');
# building excluded URL list
$filtered_urls = gen_exclude_list('url');
# building excluded useragent list
$filtered_useragents = gen_exclude_list('useragent');

# Older log layout, 14 captured fields: date, time, timezone, user,
# "request line", status, bytes, ip, "referrer", "useragent", then four
# trailing fields.
$log_pattern_old = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-\.\d]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+(.*)$/';

# Newer log layout, 23 captured fields: same as above with a numeric field
# inserted after the timezone (the pid) and nine more fields at the end, so
# every index from the user onward is shifted by one.
$log_pattern_new = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+([\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-\.\d]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+([\-\d]+)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s*$/';

$debug     = 0;
# Date of the day currently being counted ('' until the first qualifying hit)
# and the number of qualifying hits seen for it so far.
$prevdatestamp = '';
$hits      = 0;

# Insert one (datetime, hits) row into the webhits table of the metrics
# database named by the global $metrics_db.
#
#   $db_hub    - database handle handed on to mysql_exec()
#   $datestamp - value stored in the `datetime` column
#   $hits      - value stored in the `hits` column
#
# Both values go through dbquote() before being placed in the statement.
# Nothing is returned.
function update_webhits($db_hub, $datestamp, $hits)
{
	global $metrics_db;

	$table  = $metrics_db.'.webhits';
	$values = dbquote($datestamp) . ', ' . dbquote($hits);

	mysql_exec($db_hub, 'INSERT INTO '.$table.' (datetime, hits) VALUES(' . $values . ')');
}
	
# Read the log line by line and count qualifying hits per date, writing one
# webhits row each time the date changes.
#
# The loop is driven by the return value of fgets() rather than by feof():
# after the last line of a file that has no trailing newline, feof() is
# already true, so testing it before processing would drop that line.
while (($line = fgets($filehandle)) !== false)
{
	# Only the fields needed by the hit filter are extracted. The two layouts
	# differ in that the newer one has an extra field (pid) at index 4, so
	# the request line and everything after it sit one index later.
	if (preg_match($log_pattern_new, $line, $matches)) {

		$datestamp = $matches[1];
		$firstline = $matches[6];
		$return    = $matches[7];
		$bytes     = $matches[8];
		$ip        = $matches[9];
		$useragent = $matches[11];

	} else if (preg_match($log_pattern_old, $line, $matches)) {

		$datestamp = $matches[1];
		$firstline = $matches[5];
		$return    = $matches[6];
		$bytes     = $matches[7];
		$ip        = $matches[8];
		$useragent = $matches[10];

	} else {

		$unrec .= 'Unrecognized log format: '.$line;
		continue;

	}

	# Split the request line ("METHOD URL PROTOCOL") on runs of spaces.
	$request = preg_split("/[ ]+/", $firstline);
	$method  = $request[0];
	$url     = isset($request[1]) ? $request[1] : '';

	# A request line with a single token is treated as a bare URL fetched
	# with GET.
	if (empty($url))
	{
		$url = $method;
		$method = 'GET';
	}

	$url = preg_replace('/\/+/','/',$url); // collapse multiple / to single /

	# A hit counts when it succeeded with a non-empty body, used GET or POST,
	# and neither its IP, user agent nor URL is on an exclude list.
	if ($return == 200 && $bytes > 0 && (!search_array($ip, $filtered_ips)) && (!search_array($useragent, $filtered_useragents)) && (!search_array($url, $filtered_urls)) && ($method == "GET" || $method == "POST") )
	{
		# On a date change, store the finished day's total before starting
		# the count for the new day.
		if ($prevdatestamp != $datestamp)
		{
			if (!empty($prevdatestamp))
				update_webhits($db_hub, $prevdatestamp, $hits);

			$prevdatestamp = $datestamp;
			$hits = 0;
		}

		$hits++;
	}
}

# Insert total hit-count for final day into database. Skipped when no line
# qualified at all, so no row with an empty date and zero hits is written.
if (!empty($prevdatestamp))
	update_webhits($db_hub, $prevdatestamp, $hits);

if($unrec)
	print $unrec;

fclose($filehandle);
db_close($db_hub);

?>
