r6 - 22 Jan 2009 - 16:24:38 - JadirSilva. You are here: TWiki > Main Web > TWikiUsers > JadirSilva > TestLink
    1	# SCRIPT: Cluster Monitoring Script
    2	# Date..: Mon 20 Oct 2008 02:48:13 PM BRST
    3	# Author: Jadir Marra da Silva<jadir.silva13@gmail.com>
    4	#
    5	#####################################################
    6	# Jadir Silva:
    7	#       Mon 02 Nov 2008
    8	#         + acrescentado o teste para verificar o espaco no /scratch
    9	#           do nodes conforme sugerido por Sergio Lietti.
   10	
   11	# Jadir Silva:
   12	#       Mon 03 Nov 2008 09:47:20 AM BRST
   13	#         + acrescentado uma definicão condicional para MAILTO,
   14	#           MAILSUBJECT e CARBON_COPY para evitar o envio de emails
   15	#           quando estiver depurando o script
   16	
   17	# Jadir Silva:
   18	#       Mon 10 Nov 2008 11:21:26 AM BRST
   19	#         + acrescentado alerta via email para o site verify,
   20	
   21	# Jadir Silva:
   22	#       Tue 11 Nov 2008 09:09:13 AM BRST
   23	#         + acrescentado comando para remover o arquivo temporario com
   24	#           o conteudo do email de alerta.
   25	
   26	# Jadir Silva:
   27	#       Tue 11 Nov 2008 12:43:04 PM BRST
   28	#         + alterado o IDLE_THRESHOLD de 699 para 999
   29	#           seguindo orientacao do Lietti.
   30	
   31	# Jadir Silva:
   32	#       Thu 27 Nov 2008 10:22:34 AM BRST
   33	#         + correcao de pequeno problema que impedia
   34	#           o envio de email com numero de nodes com o condor down.
   35	
   36	TIER="T2_BR_SPRACE"
   37	SEND_MAIL="N"
   38	GRID_USER="mdias"
   39	MAIL_BODY=`mktemp /tmp/site_verify.XXXXXXX`
   40	MAILFROM="root@osg-ce.sprace.org.br"
   41	MAILTO="sprace_ops@googlegroups.com"
   42	CARBON_COPY="jadir.silva13@gmail.com"
   43	MAILSUBJECT="SPRACE - Monitoramento Automatico - `date`"
   44	
   45	STATUS_PAGE="/var/www/html/spracemon.html"
   46	MAIN_SERVERS="acs.grid osgce.grid osgse.grid storage01.grid storage02.grid"
   47	LOAD_THRESHOLD=9
   48	SERVERS_LOAD_THRESHOLD=14
   49	POOL_THRESHOLD=90
   50	IDLE_THRESHOLD=999
   51	#GANGLIA_LINK='http://prod-frontend.hepgrid.uerj.br/ganglia/'
   52	GANGLIA_LINK='http://osg-ce.sprace.org.br/ganglia'
   53	DCACHE_URL="http://osg-se.sprace.org.br:2288"
   54	#DCACHE_URL="http://cdfdca.fnal.gov:2288/cellInfo"
   55	
   56	# Captura uma lista com todos os nodes do cluster
   57	NODE_LIST=`links -source $GANGLIA_LINK | grep 'OPTION.*\.grid' | sed 's/<[^>]*>/ /g'`
   58	NODE_LIST=`echo $NODE_LIST | sed 's/\.grid//g'`
   59	NODE_LIST=`echo $NODE_LIST | sed 's/osgce//g;s/storage01//g;s/storage02//g;s/osgse//g;s/acs//g'`
   60	
   61	NODE_LIST=`cat /root/bin/cluster.list`
   62	
   63	PHEDEX_PROD_URL="http://cmsweb.cern.ch/phedex/prod/Components::Status"
   64	PHEDEX_DEBG_URL="http://cmsweb.cern.ch/phedex/debug/Components::Status"
   65	
   66	MSG=""
   67	FOOTER="<br /><font size=2>This is an automatic email, please do not reply</font><br />Message send in `date`"
   68	
   69	function Header(){
   70	  echo "<h2>$1</h2" >> $MAIL_BODY
   71	  echo "<hr />" >> $MAIL_BODY
   72	}
   73	
   74	function OpenTable(){
   75	  echo "<table size=100%>" >> $MAIL_BODY
   76	}
   77	
   78	function CloseTable(){
   79	  echo "</table>" >> $MAIL_BODY
   80	}
   81	
   82	function OpenTbLine(){
   83	  echo "<tr>" >> $MAIL_BODY
   84	}
   85	
   86	function CloseTbLine(){
   87	  echo "</tr>" >> $MAIL_BODY
   88	}
   89	
   90	function OpenCell(){
   91	  echo '<td valign="top">' >> $MAIL_BODY
   92	}
   93	
   94	function CloseCell(){
   95	  echo "</td>" >> $MAIL_BODY
   96	}
   97	
   98	function Link(){
   99	  echo "For details "'<a href="'$1'">click here</a>' >> $MAIL_BODY
  100	}
  101	
  102	function WriteLn(){
  103	  echo "$1<br />" >> $MAIL_BODY
  104	}
  105	
  106	function Write(){
  107	  echo "$1 " >> $MAIL_BODY
  108	}
  109	
  110	function WriteStatusPage(){
  111	  cp $MAIL_BODY $STATUS_PAGE
  112	}
  113	
  114	function InitMail(){
  115	  echo "<?xml version=\"1.0\"?>" >> $MAIL_BODY
  116	  echo "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" >> $MAIL_BODY
  117	  echo "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns:exsl=\"http://exslt.org/common\" lang=\"en\" xml:lang=\"en\">" >> $MAIL_BODY
  118	  echo "<head>" >> $MAIL_BODY
  119	  echo "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />" >> $MAIL_BODY
  120	  echo "<link rel=\"stylesheet\" type=\"text/css\" href=\"http://dashb-cms-sam.cern.ch/dashboard/templates/css/samvisualization.css\" />" >> $MAIL_BODY
  121	  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/dojo-0.4.1rc3-ajax/dojo.js\"></script>" >> $MAIL_BODY
  122	  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws.js\"></script>" >> $MAIL_BODY
  123	  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws_iframe.js\"></script>" >> $MAIL_BODY  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws_hide.js\"></script>" >> $MAIL_BODY
  124	  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/sam/common.js\"></script>" >> $MAIL_BODY
  125	  echo "<script type=\"text/javascript\" src=\"http://dashb-cms-sam.cern.ch/dashboard/templates/js/sam/latestResultsView.js\"></script>" >> $MAIL_BODY
  126	  echo "<title>SPRACE Monitoring script</title>" >> $MAIL_BODY
  127	  echo "</head>" >> $MAIL_BODY
  128	  echo "<html>" >> $MAIL_BODY
  129	  echo "  <h1>SPRACE Monitoring Report</h1>" >> $MAIL_BODY
  130	  echo "  <br /><br /><h2>Test start at `date`</h2>" >> $MAIL_BODY
  131	  echo "  <body style=\"background-image: url(http://osg-ce.sprace.org.br/banner-unico-aa.png);background-repeat: no-repeat\">" >> $MAIL_BODY
  132	}
  133	
  134	function CloseMail(){
  135	  echo "<h2>Test done at `date`</h2>" >> $MAIL_BODY
  136	  echo "<br />" >> $MAIL_BODY
  137	  echo "Report generated by monitor.sh script, developed by Jadir Silva with support of Allan Szu<br>" >> $MAIL_BODY
  138	  echo "and some suggestions from Sergio Lietti following steps defined by Marco Dias in " >> $MAIL_BODY
  139	  echo "<a href='http://www.sprace.org.br/Twiki/bin/view/Main/EntryDescriptionNo62'>[1]</a>." >> $MAIL_BODY
  140	  echo "<br /><br /><font size=2>Obs.: This script still under development, if you have any opinion,<br />" >> $MAIL_BODY
  141	  echo "contact me at jadir.silva13@gmail.com</font>" >> $MAIL_BODY
  142	  echo "  </body>" >> $MAIL_BODY
  143	  echo "</html>" >> $MAIL_BODY
  144	}
  145	
  146	function SendMail(){
  147	cat - $1 <<HERE | /usr/sbin/sendmail -oi -t
  148	From: $2
  149	To: $3
  150	Cc: $4
  151	Subject: $5
  152	Content-Type: text/html; charset=us-ascii
  153	Content-Transfer-Encoding: 7bit
  154	MIME-Version: 1.0
  155	
  156	HERE
  157	}
  158	
  159	SendMsgToTeam(){
  160	if [ "`basename $0`" == "monitor_debug.sh" ];then
  161	  MAILSUBJECT="SPRACE - Debug"
  162	  CARBON_COPY="jadirmarra@yahoo.com.br"
  163	  MAILTO="jadir.silva13@gmail.com"
  164	fi
  165	
  166	ReportMail=`mktemp /tmp/ReportMail.XXXXXX`
  167	echo "$MSG" > $ReportMail
  168	SendMail "$ReportMail" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
  169	#rm -fr $ReportMail
  170	}
  171	
  172	function ReportToTeam(){
  173	 case "$1" in
  174	   idle)
  175	       MSG="$MSG<hr /><h2>Lot of Jobs in idle state</h2><br>
  176	       There are $2 in idle state on the farm.<br>"
  177	    ;;
  178	   sam-error)
  179	       MSG="$MSG<hr /><h2>Error on SAM test</h2><br>
  180	       The SAM test presents some errors $2.<br>"
  181	    ;;
  182	    dcache)
  183	       MSG="$MSG<hr /><h2>Error in some services of dcache</h2><br>
  184	       DCache have $2 stopped services.<br>"
  185	    ;;
  186	    dpool)
  187	       MSG="$MSG<hr /><h2>Low space on some pools in dcache</h2><br />
  188	       DCache has some pools with $2 of space used.<br>"
  189	    ;;
  190	    server-load)
  191	       MSG="$MSG<hr /><h2>Load of main servers</h2><br />
  192	       $2 beyond $SERVERS_LOAD_THRESHOLD.<br>"
  193	    ;;
  194	    node-load)
  195	       MSG="$MSG<hr /><h2>Load dos nodes</h2><br />
  196	       $2 beyond $LOAD_THRESHOLD.<br />"
  197	    ;;
  198	    node_down)
  199	       MSG="$MSG<hr /><h2>Node Down</h2><br />
  200	       $2 down.<br>$FOOTER"
  201	    ;;
  202	    condor_down)
  203	       MSG="$MSG<hr /><h2>Condor Down</h2><br />
  204	       $2 with condor stopped.<br>"
  205	    ;;
  206	    low_disk)
  207	       MSG="$MSG<hr /><h2>Low space on scratch in following nodes</h2><br />
  208	       $2 <br />"
  209	    ;;
  210	    job_robot)
  211	       MSG="$MSG<hr /><h2>JobRobot with low efficiency</h2><br />
  212	       $2% <br />"
  213	    ;;
  214	    old_jobs)
  215	       MSG="$MSG<hr /><h2>Jobs more than 2 days on the farm</h2><br />
  216	       $2 <br />"
  217	    ;;
  218	    site_verify)
  219	       MSG="$MSG<hr /><h2>Site verify failed.</h2><br />
  220	       $2 <br />"
  221	    ;;
  222	    phedex_down)
  223	       MSG="$MSG<hr /><h2>Phedex Agents status.</h2><br />
  224	       $2 <br />"
  225	    ;;
  226	 esac
  227	}
  228	
  229	function JobRobotTest(){
  230	  JOB_ROBOT_URL1="http://belforte.home.cern.ch/belforte/JobRobot/summary_"`date '+%y%m%d' -d "$1 day ago"`".html"
  231	  JOBROBOT1=`links -source $JOB_ROBOT_URL1`
  232	  POSITION=`echo "$JOBROBOT1" | grep -n '<td align=left><b> T2_BR_SPRACE' | awk -F: '{print $1}'`
  233	
  234	  if [ "$POSITION" != "" ];then
  235	    SED_DATA="$POSITION,$((POSITION+5))p"
  236	    JOBROBOT1=`echo "$JOBROBOT1" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'`
  237	    EFICIENCIA=`echo $JOBROBOT1 | awk '{print $6}'`
  238	
  239	    if [ "$EFICIENCIA" == "" ];then
  240	      WriteLn "Efficiency : -- -- --"
  241	    else
  242	      if [ $EFICIENCIA -lt 60 ];then
  243	        if [ "$1" == "1" ];then
  244	          ReportToTeam "job_robot" "$EFICIENCIA"
  245	        fi
  246	        EFI="Efficiency : $EFICIENCIA% <font color=#ff0000>(below expected)</font> (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
  247	        WriteLn "$EFI"
  248	      else
  249	        EFI="Efficiency : $EFICIENCIA% Ok (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
  250	        WriteLn "$EFI"
  251	      fi
  252	    fi
  253	  else
  254	    WriteLn "Efficiency : -- -- --"
  255	  fi
  256	}
  257	
  258	# inicializa o ambiente do OSG
  259	source /OSG/setup.sh
  260	
  261	InitMail
  262	#####################################################
  263	# espaco reservado para testes nao digite nada aqui
  264	
  265	######################################################
  266	
  267	# PASSO 1. verifica quantos nodes estao down
  268	Header "Hosts down"
  269	
  270	HDOWN=`links -source $GANGLIA_LINK | grep 'class=down' | sed 's/<[^>]*>/ /g' | awk '{ print $1,"|" }'`
  271	HDOWN=`echo $HDOWN | sed 's/\.grid//g'`
  272	
  273	
  274	UP_LIST=$NODE_LIST
  275	OLD_IFS=$IFS
  276	
  277	if [ "$HDOWN" == "" ];then
  278	  WriteLn "No hosts down."
  279	else
  280	  IFS='|'
  281	  for host in $HDOWN;do
  282	    IFS=$OLD_IFS
  283	    NODE_NAME=`echo $host | awk '{print $1}'`
  284	    IFS='|'
  285	    Write "$NODE_NAME&nbsp;&nbsp;"
  286	    UP_LIST=`echo $UP_LIST | sed "s/$NODE_NAME/ /g"`
  287	    Link "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$NODE_NAME.grid"
  288	    Write "<br />"
  289	  done
  290	  ReportToTeam "node_down" "$HDOWN"
  291	fi
  292	
  293	
  294	IFS=$OLD_IFS
  295	
  296	# PASSO 2. verifica os host's que estao load > 10
  297	Header "Hosts with load equal/above $((LOAD_THRESHOLD+1))"
  298	NADA=0
  299	LOADS=""
  300	for a in $UP_LIST;do
  301	 node_load=`links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g'`
  302	 LOAD=`echo $node_load | awk '{print $1}'`
  303	 LOAD=`echo $LOAD | sed 's/\.[0-9][0-9]//g'`
  304	 if [ $LOAD -gt $LOAD_THRESHOLD ];then
  305	   WriteLn "$a load : $LOAD"
  306	   LOADS="$LOADS <BR /> $a(load=$LOAD)"
  307	   NADA=1
  308	 fi
  309	done
  310	
  311	if [ $NADA == 0 ];then
  312	  WriteLn "No host with load equal/above $((LOAD_THRESHOLD+1))."
  313	  NADA=0
  314	else
  315	  ReportToTeam "node-load" "$MSG"
  316	fi
  317	
  318	NADA=0
  319	Header "Load of main servers <br /><font size=2>$MAIN_SERVERS</font>"
  320	
  321	# PASSO 3. Load acima de 15 nos principais servidores.
  322	LOAD_NODE=""
  323	for a in $MAIN_SERVERS ;do
  324	# node_load=`links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g' | grep '[1-9][5-9]\.'`
  325	 node_load=`links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g'`
  326	 LOAD=`echo $node_load | awk '{print $1}'`
  327	 LOAD=`echo $LOAD | sed 's/\.[0-9][0-9]//g'`
  328	 if [ $LOAD -gt $SERVERS_LOAD_THRESHOLD ];then
  329	   LOAD_NODE="$a(load=$LOAD)"
  330	   WriteLn "$a load : $LOAD"
  331	   NADA=1
  332	 fi
  333	done
  334	
  335	
  336	if [ $NADA == 0 ];then
  337	  WriteLn "No host with load equal/above $((SERVERS_LOAD_THRESHOLD+1))."
  338	  NADA=0
  339	else
  340	  ReportToTeam "server-load" "$LOAD_NODE"
  341	fi
  342	
  343	# PASSO 4.
  344	# Site verify
  345	# executa o script site_verify.pl da OSG.
  346	SITE_VERIFY=`su - $GRID_USER -c "source /opt/osg-1.0.0/setup.sh;/opt/osg-1.0.0/verify/site_verify.pl" | grep -i " FAIL"`
  347	
  348	Header "Siteverify.pl status"
  349	
  350	if [ "$SITE_VERIFY" == "" ];then
  351	  WriteLn "Site verify test: <b>SUCCESS</b>"
  352	else
  353	  WriteLn "Errors founded:"
  354	  WriteLn "$SITE_VERIFY"
  355	  ReportToTeam "site_verify" "$SITE_VERIFY"
  356	fi
  357	
  358	# verifica se o condor esta rodando em todos os nos
  359	Header "Condor status"
  360	CRUNNING=0
  361	CSTOPPED=0
  362	CONDOR_MSG=""
  363	for node in $UP_LIST;do
  364	  IS_RUNNING=`ssh $node ps -fu condor | grep -v UID`
  365	  if [ "$IS_RUNNING" == "" ];then
  366	    WriteLn "$node with condor <font color=ff0000>stopped</font>."
  367	    CSTOPPED=$((CSTOPPED+1))
  368	    CONDOR_MSG="$CONDOR_MSG <BR />$node"
  369	  else
  370	    CRUNNING=$((CRUNNING+1))
  371	  fi
  372	
  373	done
  374	
  375	if [ "$CSTOPPED" == "0" ];then
  376	  WriteLn "<B>Condor running on all active nodes</B>"
  377	else
  378	  ReportToTeam "condor_down" "$CONDOR_MSG"
  379	  WriteLn "Condor running on $CRUNNING nodes and stopped on $CSTOPPED nodes."
  380	fi
  381	
  382	Header "Job status"
  383	JOBS_STATS=`condor_q | grep running`
  384	IDLE_JOBS=`echo $JOBS_STATS | awk '{print $3}'`
  385	TOTAL_JOBS=`echo $JOBS_STATS | awk '{print $1}'`
  386	RUN_JOBS=`echo $JOBS_STATS | awk '{print $5}'`
  387	HELD_JOBS=`echo $JOBS_STATS | awk '{print $7}'`
  388	
  389	OpenTable
  390	OpenTbLine
  391	OpenCell
  392	WriteLn "Running: $RUN_JOBS"
  393	if [ $IDLE_JOBS -gt $IDLE_THRESHOLD ];then
  394	  WriteLn "Idle.......: <b><font color=ff0000>$IDLE_JOBS</font> Warning!!!</b>"
  395	  ReportToTeam "idle" "$IDLE_JOBS"
  396	else
  397	  WriteLn "Idle.......: $IDLE_JOBS"
  398	fi
  399	WriteLn "Held.......: $HELD_JOBS"
  400	WriteLn "Total......: $TOTAL_JOBS"
  401	CloseCell
  402	OpenCell
  403	WriteLn "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp"
  404	CloseCell
  405	OpenCell
  406	WriteLn "If has any job <b>held</b> or more than <b>$((IDLE_THRESHOLD+1))</b> jobs in <b>idle</b><br />Please report to sprace_ops@yahoo.com.br"
  407	CloseCell
  408	CloseTbLine
  409	CloseTable
  410	
  411	Header "Jobs with more than 2 days on the farm"
  412	JOBS_RUNNING=`condor_q -run | grep [2-9]+ `
  413	if [ "$JOBS_RUNNING" == "" ];then
  414	  WriteLn "No jobs more than 2 days on the farm"
  415	else
  416	  MORE2DAYS=`echo $JOBS_RUNNING | sed 's/\.grid/\.grid<br \/>/g'`
  417	  WriteLn "$MORE2DAYS"
  418	  ReportToTeam "old_jobs" "$MORE2DAYS"
  419	fi
  420	
  421	Header "Farm occupation"
  422	FarmOcupation=`condor_q -run | grep -v "OWNER" | grep -v "Submitter" | awk '{print $2}' | sort | uniq -c | sed 1d`
  423	WriteLn "<pre>$FarmOcupation</pre>"
  424	
  425	Header "SAM test"
  426	
  427	SAM=`links -source "http://dashb-cms-sam.cern.ch/dashboard/request.py/latestresultssmry?siteSelect3=T2T1T0&serviceTypeSelect3=vo&sites=T2_BR_SPRACE&services=CE&services=SRMv2&tests=1301&tests=133&tests=111&tests=6&tests=1261&tests=76&tests=64&tests=20&tests=281&tests=882&exitStatus=all" | \
  428	  sed -e '/latestresultssmrytable/!d' | \
  429	  awk '{ print substr($0,index($0,"latestresultssmrytable")) }' |\
  430	  sed 's/target=\"\_blank\">//g'`
  431	
  432	SAM_LINK='"http://dashb-cms-sam.cern.ch/dashboard/request.py/'$SAM
  433	SAM_LINK=`echo $SAM_LINK | sed 's/\"//g'`
  434	SAM_TABLE=`links -source $SAM_LINK | sed 's/\/dashboard/http\:\/\/dashb-cms-sam\.cern\.ch\/dashboard/g'`
  435	SAM_TABLE=`echo $SAM_TABLE | sed 's/<title>SAM-Latest Results<\/title><\/head><body>//g'`
  436	SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<link rel")) }'`
  437	SAM_TABLE=`echo $SAM_TABLE | sed 's/<\/body><\/html>'//g`
  438	SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<div"))}'`
  439	#SAM_TABLE=`echo $SAM_TABLE | sed 's/Service Type/Tipo de Servico/g;s/Service Name/Nome do Servico/g;s/Sitename/Sitio/g'`
  440	
  441	SAM_RED1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF0000"),24) }' `
  442	SAM_RED2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF6666"),24) }' `
  443	SAM_RED3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF9999"),24) }' `
  444	
  445	SAM_CRIT1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#CC00CC"),24) }' `
  446	SAM_CRIT2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF33FF"),24) }' `
  447	SAM_CRIT3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF99FF"),24) }' `
  448	
  449	if [ "$SAM_RED1_ERROR" == "background-color:#FF0000" ];then
  450	  ReportToTeam "sam-error" "Dark red code"
  451	fi
  452	
  453	if [ "$SAM_RED2_ERROR" == "background-color:#FF6666" ];then
  454	  ReportToTeam "sam-error" "Light red code"
  455	fi
  456	
  457	if [ "$SAM_RED3_ERROR" == "background-color:#FF9999" ];then
  458	  ReportToTeam "sam-error"  "Lightest red code"
  459	fi
  460	
  461	if [ "$SAM_CRIT1_ERROR" == "background-color:#CC00CC" ];then
  462	  ReportToTeam "sam-error"  "Dark critical code"
  463	fi
  464	
  465	if [ "$SAM_CRIT2_ERROR" == "background-color:#FF33FF" ];then
  466	  ReportToTeam "sam-error"  "Light critical code"
  467	fi
  468	
  469	if [ "$SAM_CRIT3_ERROR" == "background-color:#FF99FF" ];then
  470	  ReportToTeam "sam-error"  "Lightest critical code"
  471	fi
  472	
  473	echo "$SAM_TABLE" >> $MAIL_BODY
  474	
  475	# Verifica os servicos do d-cache
  476	Header "DCache status"
  477	
  478	CELL_INFO=`links -source $DCACHE_URL/cellinfo`
  479	USAG_INFO=`links -source $DCACHE_URL/usageInfo`
  480	CELL_STAT=`echo "$CELL_INFO" | sed 's/<[^>]*>/ /g' | grep -i offline | wc -l`
  481	USAG_STAT=`echo "$USAG_INFO" | sed 's/<[^>]*>/ /g'`
  482	
  483	if [ $CELL_STAT -gt 0 ];then
  484	  WriteLn "$CELL_STAT dcache services are stopped"
  485	  ReportToTeam "dcache" "$CELL_STAT"
  486	else
  487	  WriteLn "All dcache services(<i>daemons</i>) ok.<br />"
  488	fi
  489	
  490	LINE=""
  491	TOTAL=0
  492	FREE=0
  493	REPORTAR=""
  494	echo "$USAG_INFO" | egrep '(cell|total|free|precious)' | grep '<td' \
  495	| grep -v 'layout' | sed 's/<td class=\"//g;s/\">/ /g;s/<\/td>//g' | while read celula valor ;do
  496	
  497	  if [ "$celula" == "cell" ];then
  498	    LINE="$LINE $valor"
  499	  fi
  500	
  501	  if [ "$celula" == "total" ];then
  502	    TOTAL="$valor"
  503	  fi
  504	
  505	  if [ "$celula" == "free" ];then
  506	    FREE="$valor"
  507	  fi
  508	
  509	  if [ "$celula" == "precious" ];then
  510	    PERCENT=`cat - << HERE | bc
  511	scale=0
  512	100-((100*$FREE)/$TOTAL)
  513	HERE`
  514	
  515	    if [ $PERCENT -gt $POOL_THRESHOLD ];then
  516	      REPORTAR="$REPORTAR <br />$LINE with $PERCENT occupation"
  517	      PERCENT="<font color=#ff0000>$PERCENT%</font>"
  518	    else
  519	      PERCENT="$PERCENT%"
  520	    fi
  521	
  522	    WriteLn "$LINE with $PERCENT ocuppation"
  523	    LINE=""
  524	  fi
  525	done
  526	
  527	if [ "$REPORTAR" != "" ];then
  528	  ReportToTeam "dpool" "$REPORTAR"
  529	fi
  530	
  531	#######################
  532	# Lietti sugeriu acrescentar no relatorio o espaco ocupado pelo/scratch dos nodes
  533	# entao desenvolvi o codigo abaixo para fazer isto.
  534	x=0
  535	Header "Ocuppation of /scratch on nodes"
  536	WriteLn "<h3>Only nodes less than 8Gb.</h3>"
  537	IFS=$OLD_IFS
  538	LOW_DISK_NODES=""
  539	nodes_to_save=""
  540	for node in $UP_LIST;do
  541	  if [ "$node" = "`/bin/hostname -s`" ]; then
  542	    eval "df -h /scratch"
  543	  else
  544	    saida=`ssh $node "df /scratch"`
  545	    size=`echo $saida | awk '{print $11}'`
  546	    perc=`echo $saida | awk '{print $12}'`
  547	    if [ $size -lt 8388608 ];then
  548	      x=$(($x+1))
  549	      if [ $size -lt 1048576 ];then
  550	        node_disk_space="$node"'('"$(($size/1024))Mb"') '
  551	        WriteLn "$node_disk_space"
  552	        LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space<br />"
  553	        nodes_to_save="$nodes_to_save $node"
  554	      else
  555	        node_disk_space="$node"'('"$(($size/1048576))Gb"') '
  556	        WriteLn "$node_disk_space"
  557	        LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space<br />"
  558	        nodes_to_save="$nodes_to_save $node"
  559	      fi
  560	    fi
  561	  fi
  562	done
  563	
  564	if [ "$LOW_DISK_NODES" != "" ];then
  565	  ReportToTeam "low_disk" "$LOW_DISK_NODES"
  566	  echo "$nodes_to_save" > /tmp/nodes_full.txt
  567	else
  568	  WriteLn "No node with low space on /scratch"
  569	fi
  570	
  571	WriteLn ""
  572	
  573	Header "JobRobot Status"
  574	
  575	JobRobotTest 1
  576	JobRobotTest 2
  577	JobRobotTest 3
  578	JobRobotTest 4
  579	JobRobotTest 5
  580	JobRobotTest 6
  581	
  582	if [ "$DEBUG" == "monitor_debug.sh" ];then
  583	# codigo de teste ou instavel deve ficar aqui ####################
  584	Header "CEMon Status"
  585	ldap=`ldapsearch -x -LLL -p 2170 -h is.grid.iu.edu -b mds-vo-name=SPRACE,mds-vo-name=local,o=grid`
  586	cods=`ssh node34 "source /OSG/setup.sh;condor_status -pool osg-ress-1.fnal.gov -l -constraint 'GlueCEInfoHostName == \"osg-ce.sprace.org.br\"'"`
  587	
  588	LDAP_STATUS=`echo $ldap | grep `
  589	
  590	# fim da area de teste ###########################################
  591	fi
  592	
  593	# status dos agentes do phedex
  594	Header "Phedex Agents Status"
  595	
  596	PRODUCTION_STATUS=`links -source $PHEDEX_PROD_URL`
  597	DEBUG_STATUS=`links -source $PHEDEX_DEBG_URL`
  598	
  599	PROD_POSITION=`echo "$PRODUCTION_STATUS" | grep -n "$TIER" | awk -F: '{print $1}'`
  600	DEBG_POSITION=`echo "$DEBUG_STATUS" | grep -n "$TIER" | awk -F: '{print $1}'`
  601	
  602	TBL_CODE=""
  603	TBL_CODE2=""
  604	
  605	if [ "$PROD_POSITION" != "" ];then
  606	  SED_DATA="$((PROD_POSITION+1)),$((PROD_POSITION+2))p"
  607	  PROD_CODE=`echo "$PRODUCTION_STATUS" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'`
  608	  TBL_CODE=`echo "$PRODUCTION_STATUS" | sed -n $SED_DATA`
  609	
  610	  IS_AGENT_DOWN=`echo "$PROD_CODE" | grep DOWN`
  611	
  612	  if [ "$IS_AGENT_DOWN" != "" ];then
  613	    ReportToTeam "phedex_down" "Phedex(production) agent down"
  614	    WriteLn "Phedex(production) agents down."
  615	  else
  616	    WriteLn "Phedex(production) agents are OK."
  617	  fi
  618	fi
  619	
  620	
  621	if [ "$DEBG_POSITION" != "" ];then
  622	  SED_DATA="$((DEBG_POSITION+1)),$((DEBG_POSITION+2))p"
  623	  DEBG_CODE=`echo "$DEBUG_STATUS" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'`
  624	  TBL_CODE2=`echo "$DEBUG_STATUS" | sed -n $SED_DATA`
  625	
  626	  IS_AGENT_DOWN=`echo "$DEBG_CODE" | grep DOWN`
  627	
  628	  if [ "$IS_AGENT_DOWN" != "" ];then
  629	    ReportToTeam "phedex_down" "Phedex(debug) agent down"
  630	    WriteLn "Phedex(debug) agents down."
  631	  else
  632	    WriteLn "Phedex(debug) agents are OK."
  633	  fi
  634	fi
  635	
  636	Write "<table><tr><td>Production Agents</td>$TBL_CODE</tr><tr><td>Debug Agents</td>$TBL_CODE2</tr></table>"
  637	
  638	CloseMail
  639	WriteStatusPage
  640	
  641	
  642	# Se houver algum alerta enviar para a lista de administradores
  643	# do cluster
  644	if [ "$MSG" != "" ];then
  645	  MSG="$MSG $FOOTER"
  646	  SendMsgToTeam
  647	fi
  648	
  649	if [ "$SEND_MAIL" = "Y" ];then
  650	  SendMail $MAIL_BODY $MAILFROM $MAILTO $CARBON_COPY "$MAILSUBJECT"
  651	fi
  652	
  653	rm -frv $MAIL_BODY

Teste

-- JadirSilva - 26 Sep 2008 Outra pagina para teste

Edit | WYSIWYG | Attach | Printable | Raw View | Backlinks: Web, All Webs | History: r6 < r5 < r4 < r3 < r2 | More topic actions
 
Home
This site is powered by the TWiki collaboration platform. Copyright © by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback