1 # SCRIPT: Cluster Monitoring Script
2 # Date..: Mon 20 Oct 2008 02:48:13 PM BRST
3 # Author: Jadir Marra da Silva<jadir.silva13@gmail.com>
4 #
5 #####################################################
6 # Jadir Silva:
7 # Mon 02 Nov 2008
8 # + acrescentado o teste para verificar o espaco no /scratch
9 # do nodes conforme sugerido por Sergio Lietti.
10
11 # Jadir Silva:
12 # Mon 03 Nov 2008 09:47:20 AM BRST
13 # + acrescentado uma definicão condicional para MAILTO,
14 # MAILSUBJECT e CARBON_COPY para evitar o envio de emails
15 # quando estiver depurando o script
16
17 # Jadir Silva:
18 # Mon 10 Nov 2008 11:21:26 AM BRST
19 # + acrescentado alerta via email para o site verify,
20
21 # Jadir Silva:
22 # Tue 11 Nov 2008 09:09:13 AM BRST
23 # + acrescentado comando para remover o arquivo temporario com
24 # o conteudo do email de alerta.
25
26 # Jadir Silva:
27 # Tue 11 Nov 2008 12:43:04 PM BRST
28 # + alterado o IDLE_THRESHOLD de 699 para 999
29 # seguindo orientacao do Lietti.
30
31 # Jadir Silva:
32 # Thu 27 Nov 2008 10:22:34 AM BRST
33 # + correcao de pequeno problema que impedia
34 # o envio de email com numero de nodes com o condor down.
35
# ---------------------------------------------------------------------------
# Global configuration for the monitoring run.
# ---------------------------------------------------------------------------
TIER="T2_BR_SPRACE"
SEND_MAIL="N"
GRID_USER="mdias"
# Temporary file that accumulates the HTML report during the run.
MAIL_BODY=$(mktemp /tmp/site_verify.XXXXXXX)
MAILFROM="root@osg-ce.sprace.org.br"
MAILTO="sprace_ops@googlegroups.com"
CARBON_COPY="jadir.silva13@gmail.com"
MAILSUBJECT="SPRACE - Monitoramento Automatico - $(date)"

STATUS_PAGE="/var/www/html/spracemon.html"
MAIN_SERVERS="acs.grid osgce.grid osgse.grid storage01.grid storage02.grid"
# Thresholds are compared with -gt, so alerts fire at threshold+1 and above.
LOAD_THRESHOLD=9
SERVERS_LOAD_THRESHOLD=14
POOL_THRESHOLD=90
IDLE_THRESHOLD=999
#GANGLIA_LINK='http://prod-frontend.hepgrid.uerj.br/ganglia/'
GANGLIA_LINK='http://osg-ce.sprace.org.br/ganglia'
DCACHE_URL="http://osg-se.sprace.org.br:2288"
#DCACHE_URL="http://cdfdca.fnal.gov:2288/cellInfo"

# List of all worker nodes of the cluster.
# The static list file is authoritative; the Ganglia front page is scraped
# only when that file is not readable (previously the scrape always ran and
# its result was then thrown away).
CLUSTER_LIST_FILE="/root/bin/cluster.list"
if [ -r "$CLUSTER_LIST_FILE" ]; then
    NODE_LIST=$(cat "$CLUSTER_LIST_FILE")
else
    NODE_LIST=$(links -source "$GANGLIA_LINK" | grep 'OPTION.*\.grid' | sed 's/<[^>]*>/ /g')
    # Unquoted on purpose: flattens the list onto a single line.
    NODE_LIST=$(echo $NODE_LIST | sed 's/\.grid//g')
    # Drop the main servers, they are monitored separately.
    NODE_LIST=$(echo $NODE_LIST | sed 's/osgce//g;s/storage01//g;s/storage02//g;s/osgse//g;s/acs//g')
fi

PHEDEX_PROD_URL="http://cmsweb.cern.ch/phedex/prod/Components::Status"
PHEDEX_DEBG_URL="http://cmsweb.cern.ch/phedex/debug/Components::Status"

# MSG accumulates the alert e-mail body; it stays empty when nothing is wrong.
MSG=""
FOOTER="<br /><font size=2>This is an automatic email, please do not reply</font><br />Message send in $(date)"
68
# Header TITLE
# Appends a second-level heading and a horizontal rule to the report
# file ($MAIL_BODY).
function Header(){
    # The closing tag used to be emitted as "</h2" (missing ">").
    echo "<h2>$1</h2>" >> "$MAIL_BODY"
    echo "<hr />" >> "$MAIL_BODY"
}
73
# OpenTable
# Appends the opening tag of a full-width table to the report file ($MAIL_BODY).
function OpenTable(){
    # "size" is not a table attribute; "width" is what makes it full-width.
    echo '<table width="100%">' >> "$MAIL_BODY"
}
77
# CloseTable
# Appends the closing table tag to the report file ($MAIL_BODY).
function CloseTable(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "</table>" >> "$MAIL_BODY"
}
81
# OpenTbLine
# Appends the opening tag of a table row to the report file ($MAIL_BODY).
function OpenTbLine(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "<tr>" >> "$MAIL_BODY"
}
85
# CloseTbLine
# Appends the closing tag of a table row to the report file ($MAIL_BODY).
function CloseTbLine(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "</tr>" >> "$MAIL_BODY"
}
89
# OpenCell
# Appends the opening tag of a top-aligned table cell to the report file
# ($MAIL_BODY).
function OpenCell(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo '<td valign="top">' >> "$MAIL_BODY"
}
93
# CloseCell
# Appends the closing tag of a table cell to the report file ($MAIL_BODY).
function CloseCell(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "</td>" >> "$MAIL_BODY"
}
97
# Link URL
# Appends a "For details click here" hyperlink pointing at URL to the report
# file ($MAIL_BODY).
function Link(){
    # URL is expanded inside double quotes: the callers pass query strings
    # containing "?" and "&", which must not be word-split or glob-expanded.
    echo "For details <a href=\"$1\">click here</a>" >> "$MAIL_BODY"
}
101
# WriteLn TEXT
# Appends TEXT followed by an HTML line break to the report file ($MAIL_BODY).
function WriteLn(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "$1<br />" >> "$MAIL_BODY"
}
105
# Write TEXT
# Appends TEXT followed by a single space (no HTML line break) to the report
# file ($MAIL_BODY).
function Write(){
    # Redirect target quoted so a path with whitespace cannot break the write.
    echo "$1 " >> "$MAIL_BODY"
}
109
# WriteStatusPage
# Publishes the report by copying the report file ($MAIL_BODY) over the web
# status page ($STATUS_PAGE).
function WriteStatusPage(){
    # Both paths quoted so whitespace in either cannot split the arguments.
    cp "$MAIL_BODY" "$STATUS_PAGE"
}
113
# InitMail
# Appends the XHTML prologue of the report (doctype, <head> with the SAM
# dashboard stylesheet/scripts, opening <body> and the report title with the
# start time) to the report file ($MAIL_BODY).
#
# Fixes over the previous version:
#   * two fused echo statements wrote a literal "echo" into the markup;
#   * a second "<html>" tag was emitted after "</head>";
#   * the headings were written before the "<body>" tag was opened.
function InitMail(){
    cat >> "$MAIL_BODY" <<INIT_MAIL_EOF
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:exsl="http://exslt.org/common" lang="en" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="http://dashb-cms-sam.cern.ch/dashboard/templates/css/samvisualization.css" />
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/dojo-0.4.1rc3-ajax/dojo.js"></script>
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws.js"></script>
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws_iframe.js"></script>
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/overlibmws/overlibmws_hide.js"></script>
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/sam/common.js"></script>
<script type="text/javascript" src="http://dashb-cms-sam.cern.ch/dashboard/templates/js/sam/latestResultsView.js"></script>
<title>SPRACE Monitoring script</title>
</head>
 <body style="background-image: url(http://osg-ce.sprace.org.br/banner-unico-aa.png);background-repeat: no-repeat">
 <h1>SPRACE Monitoring Report</h1>
 <br /><br /><h2>Test start at $(date)</h2>
INIT_MAIL_EOF
}
133
# CloseMail
# Appends the report epilogue (finish time, credits, contact note and the
# closing </body></html> tags) to the report file ($MAIL_BODY).
function CloseMail(){
    # Single here-document instead of one echo per line; the redirect target
    # is quoted so a path with whitespace cannot break the write.
    cat >> "$MAIL_BODY" <<CLOSE_MAIL_EOF
<h2>Test done at $(date)</h2>
<br />
Report generated by monitor.sh script, developed by Jadir Silva with support of Allan Szu<br>
and some suggestions from Sergio Lietti following steps defined by Marco Dias in 
<a href='http://www.sprace.org.br/Twiki/bin/view/Main/EntryDescriptionNo62'>[1]</a>.
<br /><br /><font size=2>Obs.: This script still under development, if you have any opinion,<br />
contact me at jadir.silva13@gmail.com</font>
 </body>
</html>
CLOSE_MAIL_EOF
}
145
# SendMail BODY_FILE FROM TO CC SUBJECT
# Sends BODY_FILE as an HTML e-mail through /usr/sbin/sendmail.
# The here-document supplies the message headers on stdin ("-"); cat then
# appends the contents of BODY_FILE as the message body. sendmail -t takes
# the recipients from the To:/Cc: headers.
function SendMail(){
    # BODY_FILE quoted so a path with whitespace stays one argument.
    cat - "$1" <<HERE | /usr/sbin/sendmail -oi -t
From: $2
To: $3
Cc: $4
Subject: $5
Content-Type: text/html; charset=us-ascii
Content-Transfer-Encoding: 7bit
MIME-Version: 1.0

HERE
}
158
# SendMsgToTeam
# Mails the accumulated alert text ($MSG) to the operations list using
# SendMail. When the script is invoked under the name monitor_debug.sh the
# subject and recipients are redirected to the developer so that debugging
# runs do not reach the team list.
SendMsgToTeam(){
    # ${0##*/} is the script's base name without spawning basename.
    if [ "${0##*/}" == "monitor_debug.sh" ];then
        MAILSUBJECT="SPRACE - Debug"
        CARBON_COPY="jadirmarra@yahoo.com.br"
        MAILTO="jadir.silva13@gmail.com"
    fi

    local ReportMail
    ReportMail=$(mktemp /tmp/ReportMail.XXXXXX)
    echo "$MSG" > "$ReportMail"
    SendMail "$ReportMail" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
    # Remove the temporary alert file; it used to be left behind in /tmp on
    # every run that raised an alert.
    rm -f -- "$ReportMail"
}
171
# ReportToTeam CATEGORY DETAIL
# Appends one alert section (rule, heading, one line of detail) to the global
# $MSG, which is mailed to the team at the end of the run if it is not empty.
#   CATEGORY  one of: idle, sam-error, dcache, dpool, server-load, node-load,
#             node_down, condor_down, low_disk, job_robot, old_jobs,
#             site_verify, phedex_down
#   DETAIL    text interpolated into the section body
# Unknown categories leave $MSG untouched and print a warning on stderr.
function ReportToTeam(){
    local nl=$'\n'
    case "$1" in
        idle)
            MSG="$MSG<hr /><h2>Lot of Jobs in idle state</h2><br>${nl}There are $2 in idle state on the farm.<br>"
            ;;
        sam-error)
            MSG="$MSG<hr /><h2>Error on SAM test</h2><br>${nl}The SAM test presents some errors $2.<br>"
            ;;
        dcache)
            MSG="$MSG<hr /><h2>Error in some services of dcache</h2><br>${nl}DCache have $2 stopped services.<br>"
            ;;
        dpool)
            MSG="$MSG<hr /><h2>Low space on some pools in dcache</h2><br />${nl}DCache has some pools with $2 of space used.<br>"
            ;;
        server-load)
            MSG="$MSG<hr /><h2>Load of main servers</h2><br />${nl}$2 beyond $SERVERS_LOAD_THRESHOLD.<br>"
            ;;
        node-load)
            MSG="$MSG<hr /><h2>Load dos nodes</h2><br />${nl}$2 beyond $LOAD_THRESHOLD.<br />"
            ;;
        node_down)
            # No footer here: the footer is appended once, after the last
            # section, right before the message is sent.
            MSG="$MSG<hr /><h2>Node Down</h2><br />${nl}$2 down.<br>"
            ;;
        condor_down)
            MSG="$MSG<hr /><h2>Condor Down</h2><br />${nl}$2 with condor stopped.<br>"
            ;;
        low_disk)
            MSG="$MSG<hr /><h2>Low space on scratch in following nodes</h2><br />${nl}$2 <br />"
            ;;
        job_robot)
            MSG="$MSG<hr /><h2>JobRobot with low efficiency</h2><br />${nl}$2% <br />"
            ;;
        old_jobs)
            MSG="$MSG<hr /><h2>Jobs more than 2 days on the farm</h2><br />${nl}$2 <br />"
            ;;
        site_verify)
            MSG="$MSG<hr /><h2>Site verify failed.</h2><br />${nl}$2 <br />"
            ;;
        phedex_down)
            MSG="$MSG<hr /><h2>Phedex Agents status.</h2><br />${nl}$2 <br />"
            ;;
        *)
            echo "ReportToTeam: unknown alert category '$1'" >&2
            ;;
    esac
}
228
229 function JobRobotTest(){
230 JOB_ROBOT_URL1="http://belforte.home.cern.ch/belforte/JobRobot/summary_"`date '+%y%m%d' -d "$1 day ago"`".html"
231 JOBROBOT1=`links -source $JOB_ROBOT_URL1`
232 POSITION=`echo "$JOBROBOT1" | grep -n '<td align=left><b> T2_BR_SPRACE' | awk -F: '{print $1}'`
233
234 if [ "$POSITION" != "" ];then
235 SED_DATA="$POSITION,$((POSITION+5))p"
236 JOBROBOT1=`echo "$JOBROBOT1" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'`
237 EFICIENCIA=`echo $JOBROBOT1 | awk '{print $6}'`
238
239 if [ "$EFICIENCIA" == "" ];then
240 WriteLn "Efficiency : -- -- --"
241 else
242 if [ $EFICIENCIA -lt 60 ];then
243 if [ "$1" == "1" ];then
244 ReportToTeam "job_robot" "$EFICIENCIA"
245 fi
246 EFI="Efficiency : $EFICIENCIA% <font color=#ff0000>(below expected)</font> (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
247 WriteLn "$EFI"
248 else
249 EFI="Efficiency : $EFICIENCIA% Ok (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
250 WriteLn "$EFI"
251 fi
252 fi
253 else
254 WriteLn "Efficiency : -- -- --"
255 fi
256 }
257
# Initialise the OSG environment.
source /OSG/setup.sh

InitMail
#####################################################
# Area reserved for tests - do not add anything here.

######################################################

# STEP 1. Find out which nodes are down.
# Ganglia marks down hosts with class=down; the first word of each such row
# (tags stripped) is taken as the host name.
Header "Hosts down"

HDOWN=$(links -source "$GANGLIA_LINK" | grep 'class=down' | sed 's/<[^>]*>/ /g' | awk '{ print $1,"|" }')
# Unquoted on purpose: flattens the list to "host1 | host2 |" on one line.
HDOWN=$(echo $HDOWN | sed 's/\.grid//g')

# UP_LIST starts as the full node list; down hosts are removed from it below
# so that the later checks only contact nodes that are reachable.
UP_LIST=$NODE_LIST
# Saved because later sections of the script restore IFS from it.
OLD_IFS=$IFS

if [ -z "$HDOWN" ];then
    WriteLn "No hosts down."
else
    # Splitting on whitespace after dropping the "|" separators avoids
    # juggling IFS inside the loop.
    for NODE_NAME in ${HDOWN//|/ };do
        Write "$NODE_NAME "
        # Remove the host from UP_LIST by exact name (with or without the
        # .grid suffix). The previous substring substitution also mangled
        # longer names, e.g. a down "node1" corrupted "node10".
        remaining=""
        for candidate in $UP_LIST;do
            case "$candidate" in
                "$NODE_NAME"|"$NODE_NAME.grid") ;;
                *) remaining="$remaining $candidate" ;;
            esac
        done
        UP_LIST=$remaining
        Link "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$NODE_NAME.grid"
        Write "<br />"
    done
    ReportToTeam "node_down" "$HDOWN"
fi


IFS=$OLD_IFS
295
# STEP 2. Report the nodes whose load is above LOAD_THRESHOLD.
# The load is scraped from lines 82-84 of each host's Ganglia page; the first
# word (tags stripped) is used and its two-digit fraction is dropped.
Header "Hosts with load equal/above $((LOAD_THRESHOLD+1))"
NADA=0
LOADS=""
for a in $UP_LIST;do
    node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
    LOAD=$(echo $node_load | awk '{print $1}')
    LOAD=$(echo $LOAD | sed 's/\.[0-9][0-9]//g')
    # Skip hosts whose page could not be parsed; an empty or non-numeric
    # value would make the integer comparison fail with an error.
    case "$LOAD" in
        ''|*[!0-9]*) continue ;;
    esac
    if [ "$LOAD" -gt "$LOAD_THRESHOLD" ];then
        WriteLn "$a load : $LOAD"
        LOADS="$LOADS <BR /> $a(load=$LOAD)"
        NADA=1
    fi
done

if [ "$NADA" == 0 ];then
    WriteLn "No host with load equal/above $((LOAD_THRESHOLD+1))."
else
    # Send the list of overloaded nodes. The alert text accumulated so far
    # ($MSG) used to be passed here, duplicating it inside itself.
    ReportToTeam "node-load" "$LOADS"
fi
317
NADA=0
Header "Load of main servers <br /><font size=2>$MAIN_SERVERS</font>"

# STEP 3. Report the main servers whose load is above SERVERS_LOAD_THRESHOLD.
# Same scraping as for the worker nodes: lines 82-84 of the host's Ganglia
# page, first word, two-digit fraction dropped.
LOAD_NODE=""
for a in $MAIN_SERVERS ;do
    node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
    LOAD=$(echo $node_load | awk '{print $1}')
    LOAD=$(echo $LOAD | sed 's/\.[0-9][0-9]//g')
    # Skip servers whose page could not be parsed; an empty or non-numeric
    # value would make the integer comparison fail with an error.
    case "$LOAD" in
        ''|*[!0-9]*) continue ;;
    esac
    if [ "$LOAD" -gt "$SERVERS_LOAD_THRESHOLD" ];then
        # Accumulate: previously each overloaded server replaced the one
        # before it, so only the last one reached the alert e-mail.
        LOAD_NODE="$LOAD_NODE <BR /> $a(load=$LOAD)"
        WriteLn "$a load : $LOAD"
        NADA=1
    fi
done


if [ "$NADA" == 0 ];then
    WriteLn "No host with load equal/above $((SERVERS_LOAD_THRESHOLD+1))."
else
    ReportToTeam "server-load" "$LOAD_NODE"
fi
342
# STEP 4. Site verify.
# Runs the OSG site_verify.pl script as the grid user and keeps only the
# output lines that contain " FAIL" (case-insensitive).
SITE_VERIFY=$(su - $GRID_USER -c "source /opt/osg-1.0.0/setup.sh;/opt/osg-1.0.0/verify/site_verify.pl" | grep -i " FAIL")

Header "Siteverify.pl status"

if [ -n "$SITE_VERIFY" ];then
    # At least one failing check: show the lines and raise an alert.
    WriteLn "Errors founded:"
    WriteLn "$SITE_VERIFY"
    ReportToTeam "site_verify" "$SITE_VERIFY"
else
    WriteLn "Site verify test: <b>SUCCESS</b>"
fi
357
# Check that condor is running on every node that is up.
# A node counts as "running" when ps lists at least one process owned by the
# condor user (the ps header line, which contains UID, is filtered out).
Header "Condor status"
CRUNNING=0
CSTOPPED=0
CONDOR_MSG=""
for node in $UP_LIST;do
    IS_RUNNING=$(ssh $node ps -fu condor | grep -v UID)
    if [ -n "$IS_RUNNING" ];then
        CRUNNING=$((CRUNNING+1))
        continue
    fi

    # No condor process found on this node.
    WriteLn "$node with condor <font color=ff0000>stopped</font>."
    CSTOPPED=$((CSTOPPED+1))
    CONDOR_MSG="$CONDOR_MSG <BR />$node"
done

if [ "$CSTOPPED" != "0" ];then
    ReportToTeam "condor_down" "$CONDOR_MSG"
    WriteLn "Condor running on $CRUNNING nodes and stopped on $CSTOPPED nodes."
else
    WriteLn "<B>Condor running on all active nodes</B>"
fi
381
# Job queue summary.
# The counters are taken by position from the condor_q line that contains
# the word "running": word 1 = total, 3 = idle, 5 = running, 7 = held.
Header "Job status"
JOBS_STATS=$(condor_q | grep running)
IDLE_JOBS=$(echo $JOBS_STATS | awk '{print $3}')
TOTAL_JOBS=$(echo $JOBS_STATS | awk '{print $1}')
RUN_JOBS=$(echo $JOBS_STATS | awk '{print $5}')
HELD_JOBS=$(echo $JOBS_STATS | awk '{print $7}')

# Numeric copy of the idle counter for the threshold test; when condor_q
# returned nothing usable it falls back to 0 instead of breaking the
# integer comparison.
case "$IDLE_JOBS" in
    ''|*[!0-9]*) IDLE_COUNT=0 ;;
    *)           IDLE_COUNT=$IDLE_JOBS ;;
esac

OpenTable
OpenTbLine
OpenCell
WriteLn "Running: $RUN_JOBS"
if [ "$IDLE_COUNT" -gt "$IDLE_THRESHOLD" ];then
    WriteLn "Idle.......: <b><font color=ff0000>$IDLE_JOBS</font> Warning!!!</b>"
    ReportToTeam "idle" "$IDLE_JOBS"
else
    WriteLn "Idle.......: $IDLE_JOBS"
fi
WriteLn "Held.......: $HELD_JOBS"
WriteLn "Total......: $TOTAL_JOBS"
CloseCell
# Spacer cell between the counters and the note.
OpenCell
WriteLn "  "
CloseCell
OpenCell
WriteLn "If has any job <b>held</b> or more than <b>$((IDLE_THRESHOLD+1))</b> jobs in <b>idle</b><br />Please report to sprace_ops@yahoo.com.br"
CloseCell
CloseTbLine
CloseTable
410
# Jobs that have been running for two days or more.
# condor_q -run prints the run time as DAYS+HH:MM:SS. The pattern selects
# lines with a whitespace-delimited field whose day count is 2 or more
# (2-9, or any number with two or more digits) followed by "+" and a digit.
# The previous unquoted pattern [2-9]+ was exposed to shell globbing, missed
# 10, 11, 20, 21... days and could match text outside the run-time column.
Header "Jobs with more than 2 days on the farm"
JOBS_RUNNING=$(condor_q -run | grep -E '(^|[[:space:]])([2-9]|[1-9][0-9]+)\+[0-9]')
if [ -z "$JOBS_RUNNING" ];then
    WriteLn "No jobs more than 2 days on the farm"
else
    # Put everything on one line, then break after each host name.
    MORE2DAYS=$(printf '%s\n' "$JOBS_RUNNING" | tr '\n' ' ' | sed 's/\.grid/\.grid<br \/>/g')
    WriteLn "$MORE2DAYS"
    ReportToTeam "old_jobs" "$MORE2DAYS"
fi
420
# Number of running jobs per owner: the second column of condor_q -run is
# counted per distinct value, header lines excluded, and the first line of
# the resulting tally is dropped.
Header "Farm occupation"
FarmOcupation=$(condor_q -run \
    | grep -v -e "OWNER" -e "Submitter" \
    | awk '{print $2}' \
    | sort \
    | uniq -c \
    | sed 1d)
WriteLn "<pre>$FarmOcupation</pre>"
424
# SAM test results.
# 1. Fetch the dashboard summary page and cut out the relative link that
#    starts at "latestresultssmrytable".
# 2. Fetch that table, rewrite its /dashboard links to absolute URLs and trim
#    everything outside the first <div>.
# 3. Raise one alert per error colour found in the table, then embed the
#    table in the report.
Header "SAM test"

SAM=$(links -source "http://dashb-cms-sam.cern.ch/dashboard/request.py/latestresultssmry?siteSelect3=T2T1T0&serviceTypeSelect3=vo&sites=T2_BR_SPRACE&services=CE&services=SRMv2&tests=1301&tests=133&tests=111&tests=6&tests=1261&tests=76&tests=64&tests=20&tests=281&tests=882&exitStatus=all" |
    sed -e '/latestresultssmrytable/!d' |
    awk '{ print substr($0,index($0,"latestresultssmrytable")) }' |
    sed 's/target=\"\_blank\">//g')

SAM_LINK='"http://dashb-cms-sam.cern.ch/dashboard/request.py/'$SAM
SAM_LINK=$(echo $SAM_LINK | sed 's/\"//g')

# NOTE(review): the unquoted echo calls below are load-bearing - they flatten
# the page onto a single line, which the awk index()/substr() steps rely on.
SAM_TABLE=$(links -source $SAM_LINK | sed 's/\/dashboard/http\:\/\/dashb-cms-sam\.cern\.ch\/dashboard/g')
SAM_TABLE=$(echo $SAM_TABLE | sed 's/<title>SAM-Latest Results<\/title><\/head><body>//g')
SAM_TABLE=$(echo $SAM_TABLE | awk '{ print substr($0,index($0,"<link rel")) }')
SAM_TABLE=$(echo $SAM_TABLE | sed 's/<\/body><\/html>//g')
SAM_TABLE=$(echo $SAM_TABLE | awk '{ print substr($0,index($0,"<div"))}')
#SAM_TABLE=`echo $SAM_TABLE | sed 's/Service Type/Tipo de Servico/g;s/Service Name/Nome do Servico/g;s/Sitename/Sitio/g'`

# Each entry is "<colour>:<alert text>". A colour is present when the 24
# characters starting at its first occurrence equal the marker itself.
for sam_alert in \
    "FF0000:Dark red code" \
    "FF6666:Light red code" \
    "FF9999:Lightest red code" \
    "CC00CC:Dark critical code" \
    "FF33FF:Light critical code" \
    "FF99FF:Lightest critical code"
do
    sam_marker="background-color:#${sam_alert%%:*}"
    sam_hit=$(echo $SAM_TABLE | awk -v marker="$sam_marker" '{ print substr($0,index($0,marker),24) }')
    if [ "$sam_hit" == "$sam_marker" ];then
        ReportToTeam "sam-error" "${sam_alert#*:}"
    fi
done

echo "$SAM_TABLE" >> $MAIL_BODY
474
# Check the dCache services and the occupation of each pool.
Header "DCache status"

CELL_INFO=$(links -source "$DCACHE_URL/cellinfo")
USAG_INFO=$(links -source "$DCACHE_URL/usageInfo")
# Number of lines of the cell page (tags stripped) that mention "offline".
CELL_STAT=$(echo "$CELL_INFO" | sed 's/<[^>]*>/ /g' | grep -ic offline)

if [ "$CELL_STAT" -gt 0 ];then
    WriteLn "$CELL_STAT dcache services are stopped"
    ReportToTeam "dcache" "$CELL_STAT"
else
    WriteLn "All dcache services(<i>daemons</i>) ok.<br />"
fi

# The usage page is reduced to "<class> <value>" pairs, one per table cell.
# "cell" carries the pool name, "total" and "free" its sizes, and "precious"
# is treated as the end of a pool's row, at which point the occupation is
# computed and written.
#
# The loop reads from a process substitution so that it runs in the current
# shell. Fed through a pipe it ran in a subshell, REPORTAR was lost when the
# loop ended and the "dpool" alert below was never sent.
LINE=""
TOTAL=0
FREE=0
REPORTAR=""
while read celula valor ;do
    case "$celula" in
        cell)
            LINE="$LINE $valor"
            ;;
        total)
            TOTAL="$valor"
            ;;
        free)
            FREE="$valor"
            ;;
        precious)
            # Only compute when both sizes are plain integers and the total
            # is not zero (bc would otherwise fail or divide by zero).
            if [[ $TOTAL =~ ^[0-9]+$ && $FREE =~ ^[0-9]+$ && ! $TOTAL =~ ^0+$ ]];then
                PERCENT=$(echo "scale=0; 100-((100*$FREE)/$TOTAL)" | bc)

                if [ "$PERCENT" -gt "$POOL_THRESHOLD" ];then
                    REPORTAR="$REPORTAR <br />$LINE with $PERCENT occupation"
                    PERCENT="<font color=#ff0000>$PERCENT%</font>"
                else
                    PERCENT="$PERCENT%"
                fi

                WriteLn "$LINE with $PERCENT ocuppation"
            fi
            LINE=""
            ;;
    esac
done < <(echo "$USAG_INFO" | grep -E '(cell|total|free|precious)' | grep '<td' \
    | grep -v 'layout' | sed 's/<td class=\"//g;s/\">/ /g;s/<\/td>//g')

if [ -n "$REPORTAR" ];then
    ReportToTeam "dpool" "$REPORTAR"
fi
530
#######################
# Lietti suggested adding the space left on /scratch of each node to the
# report; the code below does that.
# A node is listed when it has less than 8 GiB (8388608 KiB) available.
x=0
Header "Ocuppation of /scratch on nodes"
WriteLn "<h3>Only nodes less than 8Gb.</h3>"
IFS=$OLD_IFS
LOW_DISK_NODES=""
nodes_to_save=""
LOCAL_HOST=$(/bin/hostname -s)
for node in $UP_LIST;do
    # -P keeps each filesystem on one line and -k forces 1 KiB units, so the
    # available space is always field 4 of line 2.
    # The local node is measured directly instead of over ssh; it used to be
    # skipped (its df output was only printed to stdout).
    if [ "$node" = "$LOCAL_HOST" ]; then
        saida=$(df -Pk /scratch)
    else
        saida=$(ssh "$node" "df -Pk /scratch")
    fi
    size=$(printf '%s\n' "$saida" | awk 'NR==2 {print $4}')

    # ssh or df failed: say so instead of tripping the integer tests below.
    case "$size" in
        ''|*[!0-9]*)
            WriteLn "$node: /scratch usage unavailable"
            continue
            ;;
    esac

    if [ "$size" -lt 8388608 ];then
        x=$((x+1))
        # Below 1 GiB the figure is shown in Mb, otherwise in Gb.
        if [ "$size" -lt 1048576 ];then
            node_disk_space="$node($((size/1024))Mb) "
        else
            node_disk_space="$node($((size/1048576))Gb) "
        fi
        WriteLn "$node_disk_space"
        LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space<br />"
        nodes_to_save="$nodes_to_save $node"
    fi
done

if [ -n "$LOW_DISK_NODES" ];then
    ReportToTeam "low_disk" "$LOW_DISK_NODES"
    # List of affected nodes, kept as a file in /tmp.
    echo "$nodes_to_save" > /tmp/nodes_full.txt
else
    WriteLn "No node with low space on /scratch"
fi

WriteLn ""
572
# JobRobot efficiency for each of the last six days, most recent first.
Header "JobRobot Status"

for days_ago in 1 2 3 4 5 6;do
    JobRobotTest $days_ago
done
581
# Area for test or unstable code; it only runs when $DEBUG is set to
# "monitor_debug.sh".
# NOTE(review): nothing in this script assigns DEBUG, so this branch looks
# unreachable unless the variable comes from the environment - confirm.
if [ "$DEBUG" == "monitor_debug.sh" ];then
    Header "CEMon Status"
    ldap=$(ldapsearch -x -LLL -p 2170 -h is.grid.iu.edu -b mds-vo-name=SPRACE,mds-vo-name=local,o=grid)
    cods=$(ssh node34 "source /OSG/setup.sh;condor_status -pool osg-ress-1.fnal.gov -l -constraint 'GlueCEInfoHostName == \"osg-ce.sprace.org.br\"'")

    # NOTE(review): grep is called without a pattern, so it only prints a
    # usage error and LDAP_STATUS stays empty. The intended filter is not
    # known - TODO supply it.
    LDAP_STATUS=$(echo $ldap | grep )

    # End of the test area.
fi
592
# Status of the PhEDEx agents (production and debug instances).
Header "Phedex Agents Status"

PRODUCTION_STATUS=$(links -source "$PHEDEX_PROD_URL")
DEBUG_STATUS=$(links -source "$PHEDEX_DEBG_URL")

# CheckPhedexAgents LABEL PAGE
# Looks for the first line of PAGE that mentions $TIER and inspects the two
# lines that follow it. If they contain "DOWN" (tags stripped) a
# "phedex_down" alert is raised; either way one status line is written to
# the report. Nothing is written when $TIER does not appear in PAGE.
# The raw HTML of the two lines is returned in the global PHEDEX_CELLS
# (empty when the site was not found). A global is used rather than stdout
# because the function must run in the current shell for the alert to be
# recorded in $MSG.
CheckPhedexAgents(){
    local label=$1 page=$2 position
    PHEDEX_CELLS=""

    # head -n 1: several matches would otherwise yield a multi-line value
    # and break the arithmetic below.
    position=$(echo "$page" | grep -n "$TIER" | head -n 1 | cut -d: -f1)
    if [ -z "$position" ];then
        return 0
    fi

    PHEDEX_CELLS=$(echo "$page" | sed -n "$((position+1)),$((position+2))p")

    if echo "$PHEDEX_CELLS" | sed 's/<[^>]*>/ /g' | grep -q DOWN;then
        ReportToTeam "phedex_down" "Phedex($label) agent down"
        WriteLn "Phedex($label) agents down."
    else
        WriteLn "Phedex($label) agents are OK."
    fi
}

CheckPhedexAgents "production" "$PRODUCTION_STATUS"
TBL_CODE=$PHEDEX_CELLS

CheckPhedexAgents "debug" "$DEBUG_STATUS"
TBL_CODE2=$PHEDEX_CELLS

Write "<table><tr><td>Production Agents</td>$TBL_CODE</tr><tr><td>Debug Agents</td>$TBL_CODE2</tr></table>"
637
# Finish the report and publish it as the web status page.
CloseMail
WriteStatusPage


# If any alert was raised, send it to the cluster administrators' list.
if [ -n "$MSG" ];then
    MSG="$MSG $FOOTER"
    SendMsgToTeam
fi

# Optionally mail the complete report as well.
if [ "$SEND_MAIL" = "Y" ];then
    # Every argument is quoted: an empty or multi-word value would otherwise
    # shift the remaining arguments (e.g. the subject ending up as Cc).
    SendMail "$MAIL_BODY" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
fi

# Remove the temporary report file.
rm -fv -- "$MAIL_BODY"
Teste
--
JadirSilva - 26 Sep 2008
Outra pagina para teste