#!/bin/sh
# PCP QA Test No. 1201
# pmlogger_check for primary logger and pmcd may not be running (yet)
# - systemd and package install failure case
#
# This is the multiple pmlogger version.  See qa/1200 for the single
# pmlogger version.
#
# Copyright (c) 2020 Ken McDonell.  All Rights Reserved.
#

# name of this test (basename of the script); printed in the banner below
# and used later to name the per-test $seq.full file
seq=`basename $0`
echo "QA output created by $seq"

# get standard environment, filters and checks
# (helpers used below that are not defined in this file, e.g. _service,
# _notrun and _wait_for_pmcd, presumably come from these files)
. ./common.product
. ./common.filter
. ./common.check

# Undo the changes made by this test; run from the exit/signal trap.
#
# Globals read: here, _needclean, remote, sudo, tmp, PCP_PMLOGGERCONTROL_PATH
# Globals set:  _needclean (reset to false once the services are restored),
#               PMLOGGER_CHECK_SKIP_JANITOR (exported as "no")
#
_cleanup()
{
    cd "$here"
    # An unset _needclean must count as "nothing to clean".  A bare
    # "if $_needclean" with an empty expansion is an empty command, which
    # succeeds, so an exit taken before _needclean is assigned would
    # otherwise restart the PCP services for no reason.
    if ${_needclean:-false}
    then
	[ -f "${PCP_PMLOGGERCONTROL_PATH}.d/$remote" ] \
	    && $sudo rm -f "${PCP_PMLOGGERCONTROL_PATH}.d/$remote"
	# the test body exports this as "yes"; switch it back before the
	# services are restarted
	export PMLOGGER_CHECK_SKIP_JANITOR=no
	_service pcp restart 2>&1 | _filter_pcp_start
	_wait_for_pmcd
	_wait_for_pmlogger
	_needclean=false
    fi
    # the glob is left outside the quotes so that $tmp.* still expands
    $sudo rm -rf "$tmp" "$tmp".*
}

# Local variant of _wait_for_pmlogger (from which this was borrowed).
#
# Probes the primary pmlogger with "pmlc -P" up to $_maxdelay times, one
# second apart.  Prints nothing when pmlc gets past the connection stage;
# otherwise prints a report: the date, pmlogger.log (if it exists) and the
# output of the last pmlc attempt.
#
_my_wait_for_pmlogger()
{
    # 6 seconds seems like a reasonable upper bound for pmlogger to get going
    _maxdelay=6
    _dir_hostname=$(hostname || echo localhost)
    _logfile="$PCP_ARCHIVE_DIR/$_dir_hostname/pmlogger.log"

    _dead=true
    _i=0
    until [ $_i -ge $_maxdelay ]
    do
	# the last pmlc output is kept in $tmp.err for the report below
	if ! $sudo -u pcp pmlc -P </dev/null 2>&1 \
		| tee $tmp.err \
		| grep -E "Connection refused|Transport endpoint is not connected" >/dev/null
	then
	    # no connection error, so the pmlogger socket has been set up;
	    # pause so pmlogger can notice pmlc has gone and free the port
	    _dead=false
	    sleep 1
	    break
	fi
	sleep 1
	_i=$((_i + 1))
    done

    # pmlogger answered: nothing to report
    $_dead || return 0

    echo "now: $(date)"
    echo "Oops ... primary pmlogger failed to start after $_maxdelay seconds"
    echo "pmlogger log ($_logfile) ..."
    if [ -f $_logfile ]
    then
	cat $_logfile
    else
	echo "Not created ... this is good as it means pmlogger_check noticed"
    fi
    echo "pmlc attempt ..."
    [ -f $tmp.err ] && cat $tmp.err
}

# Filter stdin to stdout so the output is independent of date, archive
# directory and local hostname.  An unfiltered copy of the input is
# appended to $seq.full first.
#
# Globals read: seq, PCP_ARCHIVE_DIR, _dir_hostname
#
_filter()
{
    # one sed script, applied in this order: date, archive dir, hostname
    tee -a $seq.full \
    | sed -e "/^now: /s/ .*/ DATE/;s@$PCP_ARCHIVE_DIR@PCP_ARCHIVE_DIR@;s@$_dir_hostname@HOSTNAME@" \
    | _filter_pmlogger_log
}

status=1	# failure is the default!
# start clean: remove temp files and the $seq.full left by an earlier run
$sudo rm -rf $tmp $tmp.* $seq.full
# on exit (0) and on signals 1, 2, 3 and 15: run _cleanup, then exit with
# whatever $status holds at that time (hence the escaped \$status)
trap "_cleanup; exit \$status" 0 1 2 3 15

# a second host running pmcd is needed for the non-primary pmlogger;
# without one the test is handed to _notrun (presumably skipping it)
remote=`./getpmcdhosts -L -n 1`
[ -z "$remote" ] && _notrun "Cannot find remote host running pmcd"
echo "remote=\"$remote\"" >>$seq.full

# from here on _cleanup has to remove the control file and restart PCP
_needclean=true
# NOTE(review): myhost is not referenced again in this script
myhost=`hostname`

# pmlogger configuration for the additional pmlogger (passed to it via the
# -c option in the control file below)
cat <<End-of-File >$tmp.config
log mandatory on once { pmcd }
log advisory on default { kernel.all.cpu }
End-of-File

# control file with a single entry for $remote; \$version is escaped so it
# is written literally, everything else in the here-document is expanded now
cat <<End-of-File >$tmp.control
# Installed by PCP QA test $seq on `date`
\$version=1.1
$remote         n   n   PCP_ARCHIVE_DIR/$remote          -c $tmp.config
End-of-File

# install it under the control directory; _cleanup removes it again
$sudo cp $tmp.control ${PCP_PMLOGGERCONTROL_PATH}.d/$remote

# real QA test starts here
# snapshot of the pmcd/pmlogger processes before anything is touched; the
# [P] and [p] brackets keep the grep from matching its own command line
echo "[`date`] initially" >>$seq.full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq.full

# stop pmcd, and all pmloggers
#
echo "[`date`] pcp stop" >>$seq.full
_service pcp stop 2>&1 | _filter_pcp_stop
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq.full

# from here on, don't use any "_service" wrapper ... we need to dodge
# any linking of the services and starting stuff under the covers
#
# _dir_hostname and _logfile are set here as well as inside
# _my_wait_for_pmlogger: _filter reads $_dir_hostname, and both run as
# parts of a pipeline below, so assignments made in one are not reliably
# visible in the other
_dir_hostname=`hostname || echo localhost`
_logfile="$PCP_ARCHIVE_DIR/$_dir_hostname/pmlogger.log" 
# remove any old log so its absence after the start attempt is meaningful
$sudo rm -f $_logfile
echo "[`date`]" >>$seq.full
echo "pmcd not running, expect this to timeout" | tee -a $seq.full
# start pmlogger straight from its rc script while pmcd is still down
$sudo $PCP_RC_DIR/pmlogger start 2>&1 | _filter_pcp_start
_my_wait_for_pmlogger | _filter
echo "[`date`]" >>$seq.full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq.full

# now start pmcd straight from its rc script and wait until it is up
$sudo $PCP_RC_DIR/pmcd start 2>&1 | _filter_pcp_start
_wait_for_pmcd

# the "rc" pmlogger script calls pmlogger_check and now pmlogger_check
# uses pmlogger_janitor to kill off any "lost" pmlogger's, we don't want
# pmlogger_janitor to kill the primary pmlogger we're about to start ...
#
# (_cleanup sets this back to "no")
export PMLOGGER_CHECK_SKIP_JANITOR=yes

# second attempt, this time with pmcd running
$sudo rm -f $_logfile
echo "[`date`]" >>$seq.full
echo "pmcd running, expect this to work" | tee -a $seq.full
$sudo $PCP_RC_DIR/pmlogger start 2>&1 | _filter_pcp_start
_wait_for_pmlogger
echo "[`date`]" >>$seq.full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq.full
# Fetch pmcd.pmlogger.pmcd_host once (raw copy appended to $seq.full) and
# use it to check that both the primary pmlogger and the pmlogger for
# $remote are present.
pminfo -f pmcd.pmlogger.pmcd_host | tee -a $seq.full >$tmp.tmp

if ! grep '"primary"' $tmp.tmp >/dev/null
then
    echo "Error: primary pmlogger missing"
    cat $tmp.tmp
    pcp
else
    echo "Found primary pmlogger"
fi

# Look for "$remote" first.  getpmcdhosts may have returned a FQDN, but
# hostname() on the remote host may return an abbreviated name which is
# the hostname we see in pmcd.pmlogger.pmcd_host, so fall back to the
# name stripped at the first dot (only computed when the first grep fails).
if grep "\"$remote\"" $tmp.tmp >/dev/null \
    || { r=$(echo $remote | sed -e 's/\..*//'); grep "\"$r\"" $tmp.tmp >/dev/null; }
then
    echo "Found non-primary pmlogger"
else
    echo "Error: non-primary pmlogger missing"
    cat $tmp.tmp
    pcp
fi

# success, all done
status=0
exit
