To automatically check your Solaris Volume Manager configuration for errors, create a script
that the cron utility can periodically run. The following example shows a script that you can adapt and modify for
your needs.
Note - This script serves as a starting point for automating error checking for
Solaris Volume Manager. You probably need to modify this script for your own
configuration.
#
#!/bin/ksh
#ident "@(#)metacheck.sh 1.3 96/06/21 SMI"
# ident='%Z%%M% %I% %E% SMI'
#
# Copyright (c) 1999 by Sun Microsystems, Inc.
#
# metacheck
#
# Check on the status of the metadevice configuration. If there is a problem
# return a non zero exit code. Depending on options, send email notification.
#
# -h
# help
# -s setname
# Specify the set to check. By default, the 'local' set will be checked.
# -m recipient [recipient...]
# Send email notification to the specified recipients. This
# must be the last argument. The notification shows up as a short
# email message with a subject of
# "Solaris Volume Manager Problem: metacheck.who.nodename.setname"
# which summarizes the problem(s) and tells how to obtain detailed
# information. The "setname" is from the -s option, "who" is from
# the -w option, and "nodename" is reported by uname(1).
# Email notification is further affected by the following options:
# -f to suppress additional messages after a problem
# has been found.
# -d to control the suppression.
# -w to identify who generated the email.
# -t to force email even when there is no problem.
# -w who
# indicate who is running the command. By default, this is the
# user-name as reported by id(1M). This is used when sending
# email notification (-m).
# -f
# Enable filtering. Filtering applies to email notification (-m).
# Filtering requires root permission. When sending email notification
# the file /etc/lvm/metacheck.setname.pending is used to
# control the filter. The following matrix specifies the behavior
# of the filter:
#
# problem_found file_exists
# yes no Create file, send notification
# yes yes Resend notification if the current date
# (as specified by -d datefmt) is
# different than the file date.
# no yes Delete file, send notification
# that the problem is resolved.
# no no Send notification if -t specified.
#
# -d datefmt
# Specify the format of the date for filtering (-f). This option
# controls how often re-notification via email occurs. If the
# current date according to the specified format (strftime(3C)) is
# identical to the date contained in the
# /etc/lvm/metacheck.setname.pending file then the message is
# suppressed. The default date format is "%D", which will send one
# re-notification per day.
# -t
# Test mode. Enable email generation even when there is no problem.
# Used for end-to-end verification of the mechanism and email addresses.
#
#
# These options are designed to allow integration of metacheck
# into crontab. For example, a root crontab entry of:
#
# 0,15,30,45 * * * * /usr/sbin/metacheck -f -w SVMcron \
# -d '\%D \%H' -m notice@example.com pager@example.com
#
# would check for problems every 15 minutes, and generate an email to
# notice@example.com (and send to an email pager service) every hour when
# there is a problem. Note the \ prior to the '%' characters for a
# crontab entry. Bounced email would come back to root@nodename.
# The subject line for email generated by the above line would be
# Solaris Volume Manager Problem: metacheck.SVMcron.nodename.local
#
# decho message...
#   Debug trace helper. When the global $debug is "yes", writes
#   "DEBUG: <message>" straight to /dev/tty so the text is visible even
#   when the caller's stdout is a pipe. Otherwise does nothing.
decho()
{
    case "$debug" in
    yes)
        echo "DEBUG: $*" < /dev/null > /dev/tty 2>&1
        ;;
    esac
}
# strstr needle [word...]
#   Membership test: prints needle if it is exactly equal to one of the
#   remaining arguments, otherwise prints an empty line.
strstr()
{
    typeset needle="$1"
    typeset found=""
    typeset word
    shift
    # decho "strstr LOOK .$needle. FIRST .$1."
    for word in "$@" ; do
        [ "$word" = "$needle" ] && found="$needle"
    done
    echo "$found"
}
# strdstr unwanted [word...]
#   Prints the remaining arguments with every word equal to "unwanted"
#   removed. Each surviving word is preceded by a single space, so a
#   non-empty result starts with a space.
strdstr()
{
    typeset unwanted="$1"
    typeset kept=""
    typeset word
    shift
    # decho "strdstr LOOK .$unwanted. FIRST .$1."
    for word in "$@" ; do
        [ "$word" = "$unwanted" ] && continue
        kept="$kept $word"
    done
    echo "$kept"
}
# merge_continued_lines
#   Filter (stdin -> stdout) that joins continuation lines. A line whose
#   last whitespace-separated field is a lone backslash is held back with
#   that backslash blanked out and is prepended to the next line that does
#   not end in a backslash. All other lines pass through unchanged.
#
#   The program is passed as a plain argument rather than with "-e",
#   which is not a POSIX awk option. A continuation still pending at end
#   of input is printed instead of being silently dropped.
merge_continued_lines()
{
    awk '
    $NF == "\\" {
        # blanking the last field rebuilds $0 without the backslash
        $NF = ""
        pending = pending $0
        next
    }
    {
        print pending $0
        pending = ""
    }
    END {
        if (pending != "")
            print pending
    }'
}
# find_meta_devices [word...]
#   Keeps only the arguments that look like metadevice names: the letter
#   "d" followed by one or more decimal digits. Everything else is
#   dropped. Prints the survivors on one line, each preceded by a space.
#
#   The match is written with plain POSIX patterns rather than the
#   ksh-only +(...) extended pattern, so the function also parses under
#   shells that lack it (bash without extglob, for example).
find_meta_devices()
{
    typeset devices=""
    typeset word
    # decho "find_meta_devices .$*."
    for word in "$@" ; do
        case $word in
        d[0-9]*)
            # starts with "d<digit>"; accept only if the rest is all digits
            case ${word#d} in
            *[!0-9]*)
                ;;
            *)
                devices="$devices $word"
                ;;
            esac
            ;;
        esac
    done
    echo "$devices"
}
# return the list of top level metadevices
#
# Reads the output of "metastat$setarg -p" ($setarg is the global set
# selector built during option parsing), with continued lines merged.
# For each line, the first metadevice name found is treated as the
# device being described and any further names as its components.
# A device is "top level" if no line lists it as a component.
# Prints the top level names on one line, separated by single spaces.
#
# NOTE(review): the result depends on the assignments made inside the
# "while" loop still being visible after the pipeline ends. That holds
# when the last pipeline stage runs in the current shell (presumably
# true for the /bin/ksh this script targets -- confirm); a shell that
# runs it in a subshell would always print an empty list.
toplevel()
{
typeset comp_meta_devices=""
typeset top_meta_devices=""
typeset devices=""
typeset device=""
typeset comp=""
metastat$setarg -p | merge_continued_lines | while read line ; do
# this echo is discarded: the loop's output is redirected to /dev/null
echo "$line"
devices=`find_meta_devices $line`
# load the metadevice names of this line into $1, $2, ...
set -- $devices
if [ $# -ne 0 ] ; then
device=$1
shift
# check to see if device already referred to as component
comp=`strstr $device $comp_meta_devices`
# $comp is unquoted: when it is empty the test collapses to "[ -z ]",
# which is true, so the device is added to the top level list
if [ -z $comp ] ; then
top_meta_devices="$top_meta_devices $device"
fi
# add components to component list, remove from top list
while [ $# -ne 0 ] ; do
comp=$1
comp_meta_devices="$comp_meta_devices $comp"
top_meta_devices=`strdstr $comp $top_meta_devices`
shift
done
fi
done > /dev/null 2>&1
# unquoted on purpose: word splitting drops the leading space
echo $top_meta_devices
}
#
# - MAIN
#
# Directory holding the SVM commands (metadb, metastat, metahs), and the
# restricted search path used for every command run from here on.
METAPATH=/usr/sbin
PATH=//usr/bin:$METAPATH

USAGE='usage: metacheck [-s setname] [-h] [[-t] [-f [-d datefmt]] [-w who] -m recipient [recipient...]]'

# Option defaults; each one can be overridden on the command line.
debug=no                # -D
filter=no               # -f
mflag=no                # -m
testarg=no              # -t
datefmt=%D              # -d
set=local               # -s
setarg=                 # " -s <set>" when a non-local set is selected

# -w default: the user name taken from the first "(...)" group of id output
who=$(id | sed -e 's/^uid=[0-9][0-9]*(//' -e 's/).*//')
# Parse the command line.
#   -d fmt  date format used by the filter      -D  debug output
#   -f      enable filtering                    -h  print usage, exit 0
#   -m      remaining operands are recipients   -s  set to check
#   -t      test mode                           -w  who is running metacheck
# "h" is part of the option string so the documented -h flag is handled
# here rather than being reported by getopts as an illegal option.
# $USAGE is always quoted: it contains [...] groups that would otherwise
# be subject to pathname expansion.
while getopts d:Dfhms:tw: flag
do
    case $flag in
    d)  datefmt=$OPTARG
        ;;
    D)  debug="yes"
        ;;
    f)  filter="yes"
        ;;
    h)  echo "$USAGE"
        exit 0
        ;;
    m)  mflag="yes"
        ;;
    s)  set=$OPTARG
        # the local set is addressed without -s; reset setarg so a later
        # "-s local" overrides an earlier "-s otherset"
        if [ "$set" != "local" ] ; then
            setarg=" -s $set"
        else
            setarg=""
        fi
        ;;
    t)  testarg="yes"
        ;;
    w)  who=$OPTARG
        ;;
    \?) echo "$USAGE"
        exit 1
        ;;
    esac
done
# if mflag specified then everything else part of recipient
shift $((OPTIND - 1))
if [ "$mflag" = "no" ] ; then
    # without -m no operands are allowed
    if [ $# -ne 0 ] ; then
        echo "$USAGE"
        exit 1
    fi
else
    # -m needs at least one recipient
    if [ $# -eq 0 ] ; then
        echo "$USAGE"
        exit 1
    fi
fi
# Everything left on the command line is the recipient list.
recipients="$*"
# Date stamp compared against the pending file by the -f filter. The
# format is quoted so that a format containing spaces (for example
# -d '%D %H') reaches date as a single operand.
curdate_filter=`date "+$datefmt"`
curdate=`date`
node=`uname -n`
# establish files
# NOTE(review): these names are predictable paths under /tmp; consider a
# private directory if this runs as root on a multi-user host.
msg_f=/tmp/metacheck.msg.$$
msgs_f=/tmp/metacheck.msgs.$$
metastat_f=/tmp/metacheck.metastat.$$
metadb_f=/tmp/metacheck.metadb.$$
metahs_f=/tmp/metacheck.metahs.$$
# remembers, per set, that a problem notification has already been sent
pending_f=/etc/lvm/metacheck.$set.pending
files="$metastat_f $metadb_f $metahs_f $msg_f $msgs_f"
rm -f $files > /dev/null 2>&1
# remove the scratch files if interrupted (HUP, INT, QUIT, TERM)
trap "rm -f $files > /dev/null 2>&1; exit 1" 1 2 3 15
# Decide whether metadb is usable. have_metadb becomes "no" when the
# command exits non-zero, or when its captured output (stdout and stderr,
# saved in $metadb_f) contains either of the two strings checked below.
have_metadb="yes"
metadb$setarg > $metadb_f 2>&1 || have_metadb="no"
if grep "there are no existing databases" < $metadb_f > /dev/null 2>&1 ; then
    have_metadb="no"
fi
if grep "/dev/md/admin" < $metadb_f > /dev/null 2>&1 ; then
    have_metadb="no"
fi
# check for problems accessing metadbs
#
# retval accumulates one value per class of problem found:
#     1  metadb could not be run
#    64  a state database replica carries an error flag
#   128  a metadevice reports a state other than Okay or Resyncing
#   256  a hot spare is not "Available"
# NOTE(review): 256 does not fit in an 8-bit exit status; whatever exits
# with retval must not pass that value through unmodified.
# A summary line for each class is appended to $msgs_f.
retval=0
if [ "$have_metadb" = "no" ] ; then
    retval=1
    echo "metacheck: metadb problem, can't run '$METAPATH/metadb$setarg'" \
        >> $msgs_f
else
    # snapshot the state
    metadb$setarg 2>&1 | sed -e '1d' | merge_continued_lines > $metadb_f
    metastat$setarg 2>&1 | merge_continued_lines > $metastat_f
    metahs$setarg -i 2>&1 | merge_continued_lines > $metahs_f
    #
    # Check replicas for problems, capital letters in the flags
    # indicate an error, fields are separated by tabs.
    #
    problem=`awk < $metadb_f -F\t '{if ($1 ~ /[A-Z]/) print $1;}'`
    if [ -n "$problem" ] ; then
        retval=$((retval + 64))
        echo "metacheck: metadb problem, for more detail run:\n\t$METAPATH/metadb$setarg -i" \
            >> $msgs_f
    fi
    #
    # Check the metadevice state
    #
    problem=`awk < $metastat_f \
        '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
    if [ -n "$problem" ] ; then
        retval=$((retval + 128))
        echo "metacheck: metadevice problem, for more detail run:" \
            >> $msgs_f
        # refine the message to toplevel metadevices that have a problem
        for device in `toplevel` ; do
            # query the same set the snapshot came from ($setarg), so the
            # per-device state matches the command suggested in the report
            problem=`metastat$setarg $device | awk \
                '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
            if [ -n "$problem" ] ; then
                echo "\t$METAPATH/metastat$setarg $device" >> $msgs_f
                # find out what is mounted on the device
                # NOTE(review): only /dev/md/dsk/ paths are matched;
                # confirm whether devices in a named set need another path
                mp=`mount|awk '/\/dev\/md\/dsk\/'$device'[ \t]/{print $1;}'`
                if [ -n "$mp" ] ; then
                    echo "\t\t$mp mounted on $device" >> $msgs_f
                fi
            fi
        done
    fi
    #
    # Check the hotspares to see if any have been used.
    #
    problem=""
    grep "no hotspare pools found" < $metahs_f > /dev/null 2>&1
    if [ $? -ne 0 ] ; then
        problem=`awk < $metahs_f \
            '/blocks/ { if ( $2 != "Available" ) print $0;}'`
    fi
    if [ -n "$problem" ] ; then
        retval=$((retval + 256))
        echo "metacheck: hot spare in use, for more detail run:\n\t$METAPATH/metahs$setarg -i" \
            >> $msgs_f
    fi
fi
# Report the outcome, then clean up and exit.
#
# With recipients (-m) the report is mailed; without them it is printed.
# Every mail subject has the form metacheck.<who>.<nodename>.<setname>.
# When filtering (-f) is on, $pending_f records that a problem mail was
# already sent and holds the date stamp used to decide when to resend.
if [ $retval -ne 0 ] ; then
    # If any errors occurred, then mail the report
    if [ -n "$recipients" ] ; then
        re=""
        if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
            re="Re: "
            # we have a pending notification, check date to see if we resend
            penddate_filter=`head -1 $pending_f`
            if [ "$curdate_filter" != "$penddate_filter" ] ; then
                rm -f $pending_f > /dev/null 2>&1
            else
                if [ "$debug" = "yes" ] ; then
                    echo "metacheck: email problem notification still pending"
                    cat $pending_f
                fi
            fi
        fi
        # The pending file may only suppress mail while filtering is on;
        # without -f a file left by an earlier filtered run is ignored.
        if [ "$filter" != "yes" ] || [ ! -f $pending_f ] ; then
            if [ "$filter" = "yes" ] ; then
                echo "$curdate_filter\n\tDate:$curdate\n\tTo:$recipients" \
                    > $pending_f
            fi
            echo "Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> $msg_f
            echo "--------------------------------------------------------------" >> $msg_f
            # $recipients is deliberately unquoted: one word per recipient
            cat $msg_f $msgs_f | mailx -s \
                "${re}Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
        fi
    else
        cat $msgs_f
    fi
else
    # no problems detected,
    if [ -n "$recipients" ] ; then
        # default is to not send any mail, or print anything.
        echo "Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> $msg_f
        echo "--------------------------------------------------------------" >> $msg_f
        if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
            # pending filter exists, remove it and send OK
            rm -f $pending_f > /dev/null 2>&1
            echo "Problem resolved" >> $msg_f
            mailx -s \
                "Re: Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients < $msg_f
        elif [ "$testarg" = "yes" ] ; then
            # for testing, send mail every time even though there is no problem
            echo "Messaging test, no problems detected" >> $msg_f
            mailx -s \
                "Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients < $msg_f
        fi
    else
        echo "metacheck: Okay"
    fi
fi
rm -f $files > /dev/null 2>&1
# Only the low 8 bits of an exit status reach the caller, so a retval of
# exactly 256 would be seen as 0 (success). Report such a value as 32,
# which no other check uses, so that a problem always exits non-zero.
exitval=$((retval % 256))
if [ $retval -ne 0 ] && [ $exitval -eq 0 ] ; then
    exitval=32
fi
exit $exitval
For information on invoking scripts by using the cron utility, see the cron(1M)
man page.