#!/bin/bash
#
# dssgcktopology - check for consistent configuration topology (enclosures, cabling, drives)
#
# Copyright 2017-2022 Lenovo. All rights reserved.
# License: Subject to terms of Lenovo License Agreement
#
# Exit status: 0 on success (or after showing the help), 2 when not run as
# root, otherwise the non-zero status reported by ERROR.
#

readonly N=$(basename "$0")            # script name
readonly T=$(date "+%Y-%m-%d.%H%M%S")  # time stamp
readonly F="$N"."$T"."$$"              # unique filename
readonly L=/var/log/dssg/"$F"          # log basename

# Print all arguments, joined by spaces, on one line (an empty line when
# called without argument).
log() { printf '%s\n' "$*"; }

# Log a command line, then evaluate it.
run() { log "RUN: $*"; eval "$@"; }

# Print a warning message using the warning color.
WARN() { log "${dssgcolorwarning}Warning: ${*}${dssgcolornormal}"; }

# Print an error message and exit.
# The exit status is the status of the command executed right before ERROR
# was called, or 1 when that status was 0.  255 is also mapped to 1 because
# the end of this script interprets 255 as "help was displayed" (it then
# removes the log files and exits 0).
ERROR() {
  local retval="$?"  # must stay the first statement to capture the caller's status
  (( retval == 0 || retval == 255 )) && retval=1
  log "${dssgcolorerror}ERROR: $* (exit status $retval)${dssgcolornormal}"
  exit "$retval"
}

# Signal handler: report the user abort on the terminal and exit.
abort() { { printf "\n"; ERROR "User abort"; } > /dev/tty; }

# Remove all the files created with the log basename.
cleanup() { rm -f -- "$L".*; }

trap abort INT TERM

dssgfuncs="$(dirname "$(readlink -f "$0")")"/dssgfuncs.sh
. "$dssgfuncs" || ERROR "Cannot source $dssgfuncs"
dssgisroot || exit 2  # not root
dssgsetcolors
mkdir -p /var/log/dssg || ERROR "Cannot create directory /var/log/dssg"

#####
# Parse the command line, collect the configurations with dssgconfigs and run
# the checks on every DSS-G2xy building block and every DSS-G100 configuration.
# Arguments: [-h|--help] nodes
# Note: pair, ece and g100 are read by the check* / validatedrives /
#       displaydrive functions through bash dynamic scoping.
# Exits 255 after displaying the help; calls ERROR when issues were found.
main() {
  # usage
  local -r purpose="
Purpose: Check for a consistent topology within DSS-G building block(s)
"
  local -r usage="
Usage: $N [-h] nodes
  -h, --help   show this help
  nodes        noderange/group or comma-separated node list
"

  # init options

  # parse options
  # getopt reorders the command-line: options -- arguments
  local cmdline  # do not assign; getopt's return status is otherwise masked out by local's one
  cmdline=$(getopt -n "${dssgcolorwarning}Warning" -o 'h' --long help -- "$@") || {
    echo "${dssgcolornormal}$usage"
    ERROR "Invalid and/or missing argument(s)"
  }
  eval set -- "$cmdline"  # replace command line
  log "Parsing options: $*"
  while true; do
    case "$1" in
      -h|--help)
        echo "${purpose}$usage"
        exit 255  # translated into "cleanup and exit 0" at the end of the script
        ;;
      --)
        shift
        break
        ;;
      *)
        break
        ;;
    esac
  done

  # nodes are either a noderange/group or a comma-separated node list
  (( $# > 1 )) && WARN "Ignoring extra parameter(s): ${*:2}"

  # obtain configurations; arguments:
  #   node list
  #   log basename
  #   allowed configs: 0=skipcheck 1=G100 2=G2xy 3=both 4=exclusive_or 5=check server models only
  #   check for node count
  #   check for active mmfsd
  #   obtain 1=topologies 2=configs only
  dssgconfigs "$1" "$L" 3 2 0 1

  # word splitting of the file content is intended: one array element per entry
  local -r g2xy=( $(cat "$L".G2xy.nodes) )
  local -r g100=( $(cat "$L".G100.nodes) )

  log
  log "Checking configuration topology..."
  local errors=0   # total number of issue types found
  local configs=0  # number of configurations with at least one issue
  local pair ece i warn regex verdict ndrives
  local -a servers drives unique

  # process each DSS-G2xy building block
  for pair in "${g2xy[@]}"; do
    warn=0
    regex="${pair/,/: |}: "  # "node1,node2" becomes "node1: |node2: "
    grep -E "$regex" "$L".G2xy.top > "$L"."$pair".top
    grep -E "$regex" "$L".allnodes.topsum > "$L"."$pair".topsum

    log
    log
    log "********** Processing building block with $pair **********"
    log
    log "Summary of the configuration found:"
    grep -E 'GNR.*enclosures|GNR.*topo|GNR.*conf' "$L"."$pair".topsum | dssgxcoll -n

    log "Checking for number of servers..."
    servers=( $(tr ',' ' ' <<< "$pair") )
    if (( ${#servers[@]} < 2 )); then
      WARN "No matching node for $pair"
      (( warn++ ))
    else
      log "Success: found ${#servers[@]} servers"
    fi
    log

    # each check returns its number of issue types as exit status
    checkenclosures || (( warn += $? ))
    checkcabling    || (( warn += $? ))
    checkdrives     || (( warn += $? ))
    checksizes      || (( warn += $? ))
    validatedrives  || (( warn += $? ))

    if (( warn )); then
      verdict="${dssgcolorwarning}Found $warn type(s) of issue"
    else
      verdict="${dssgcolorgood}Done"
    fi
    log "********** $verdict with $pair${dssgcolornormal} **********"
    (( errors += warn ))
    (( warn )) && (( configs++ ))
  done  # building block

  # process ECE configuration
  for ece in "${g100[@]}"; do
    warn=0
    pair=G100  # pair must be defined for the check* functions below

    log
    log
    log "********** Processing DSS-G100 configuration **********"
    log
    log "Summary of the configuration found:"
    regex=$(tr ',' '|' <<< "$ece")
    grep -E "$regex" "$L".allnodes.topsum > "$L"."$pair".topsum
    grep -E 'GNR.*enclosures|GNR.*topo|GNR.*conf' "$L"."$pair".topsum | dssgxcoll -n

    log "Checking for number of servers..."
    servers=( $(tr ',' ' ' <<< "$ece") )
    if (( ${#servers[@]} < 6 )); then
      WARN "at least 6 nodes should be specified for a DSS-G100 configuration regarding performance and server fault tolerance; found ${#servers[@]}"
      (( warn++ ))
    else
      log "Success: found ${#servers[@]} servers"
    fi
    log

    log "Checking for number of NVMe drives..."
    # one element per "sees N disk(s)" line of the topology summary
    drives=( $(grep "sees.*disk" "$L"."$pair".topsum | sed 's,.*sees\s*\([0-9]\+\)\s*.*,\1,') )
    ndrives=0
    for i in "${drives[@]}"; do
      (( ndrives += i ))
    done
    if (( ndrives < 12 )); then
      WARN "A total of at least 12 NVMe drives is required; found $ndrives"
      (( warn++ ))
    else
      log "Success: found a total of $ndrives drives"
    fi
    log

    log "Checking for a balanced configuration..."
    unique=( $(printf "%d\n" "${drives[@]}" | sort -u) )
    if (( ${#unique[@]} > 1 )); then
      WARN "Unbalanced drive configuration: $(echo "${unique[@]}" | tr ' ' ,)"
      (( warn++ ))
    else
      log "Success: found ${unique[*]} drive(s) per server"
    fi
    log

    checkdrives || (( warn += $? ))
    checksizes  || (( warn += $? ))
    # validatedrives || (( warn += $? ))  # For ECE not yet ready to check firmwareTable.drive

    if (( warn )); then
      verdict="${dssgcolorwarning}Found $warn type(s) of issue"
    else
      verdict="${dssgcolorgood}Done"
    fi
    log "********** $verdict with DSS-G100 configuration${dssgcolornormal} **********"
    (( errors += warn ))
    (( warn )) && (( configs++ ))
  done

  log
  if (( errors )); then
    ERROR "Found $errors type(s) of issue to be resolved in $configs configuration(s)"
  fi
  log "${dssgcolorgood}All done${dssgcolornormal} - see ${L}.log"
  return 0
} # main

#####
# Report the drive location / disk size messages found in the topology summary
# of the current configuration.
# Reads:   L, pair
# Writes:  "$L"."$pair".drives
# Returns: number of issue types found (0 or 1)
checkdrives() {
  local warn=0
  local -r summary="$L"."$pair".topsum
  local -r issues="$L"."$pair".drives

  log "Checking for drive issues..."
  {
    grep -i "Location.*but should" "$summary"
    grep -i "Location.*disk size" "$summary"
  } > "$issues"

  if test -s "$issues"; then
    (( warn++ ))
    WARN "The following problems related to drive issues were detected:"
    sort -k3 "$issues" | dssgxcoll -n
  else
    log "Success: no drive issue detected"
    log
  fi
  return "$warn"
}

#####
# Check that the drives of the current configuration have uniform capacities.
# Reads:   L, pair (and, through displaydrive, g100)
# Returns: number of issue types found (0 or 1)
checksizes() {
  local warn=0
  local ssdwarn=0 logtipbackupwarn=0
  local cap ndrives found
  local servers="$pair"  # read by displaydrive (dynamic scoping)

  log "Checking for drive capacities..."

  # sequential array of drives with capacity, e.g.
  # 1,38Y0KBA,1-19,naa.5000C50083E027E3,HDD=6000
  # 1,38Y0KBA,1-1,naa.5002538A475B6460,SSD=800
  # 1,38Y0KBA,1-20,naa.5000C50084EA669B,HDD=6000
  local -r drives=( $(grep ': [^:]*:0:' "$L"."$pair".top | grep -v RAID | awk -F: '
    {
      loc=(length($18) ? $18 : $1)   # enclosure: S/N (=external) or server name (=internal)
      slot=$20                       # drive slot
      split($5, s, /\|/)
      type=(s[7]+0 ? "HDD" : "SSD")  # drive RPM value (zero for SSD)
      # number,(enclosure_sn|server),slot,wwn,type=capacity (rounded to 10GB then 1GB)
      print loc "," slot "," $6 "," type "=" int($15/1e10)*10
    }' | sed "$(displaydrive)" | sort -u) )

  # arrays of SSD and HDD capacities
  local -r ssdcap=( $(printf "%s\n" "${drives[@]}" | grep SSD | cut -d= -f2 | sort -u -n) )
  local -r hddcap=( $(printf "%s\n" "${drives[@]}" | grep HDD | cut -d= -f2 | sort -u -n) )
  local -r logtipbackup="^1,.*(00|23|1-01|2-14).*SSD"  # logTipBackup SSD locations in first enclosure

  # DSS-G2xy: either HDDs and 2+ SSDs (logTipBackup and/or hybrids) or SSDs only of the same capacity (DSS-G20y, no logTipBackup)
  # DSS-G100: SSDs only of the same capacity
  if (( ${#ssdcap[@]} > 2 )); then    # more than 2 different SSD capacities
    ssdwarn=1                         # definitely an issue
  elif (( ${#ssdcap[@]} > 1 )); then  # likely a hybrid, look for logTipBackup SSDs in first enclosure
    found=0
    for cap in "${ssdcap[@]}"; do
      # the capacity is anchored at the end of the line so that e.g. 800 does not match 8000
      ndrives=$(printf "%s\n" "${drives[@]}" | grep -cE "${logtipbackup}=${cap}\$")
      (( ndrives == 2 )) && found=1 && break  # found logTipBackup SSDs
    done
    (( !found )) && ssdwarn=1 && logtipbackupwarn=1  # looks like logTipBackup SSDs are of different capacities
  fi

  # found discrepancy
  if (( ${#hddcap[@]} > 1 || ssdwarn )); then
    (( warn++ ))
    local majhdd=0 majhddcap=0 majssd=0 majssdcap=0
    local report=""
    local -a colopts=()
    WARN "Found non-uniform capacities for SSDs and/or HDDs; see details below"

    # determine majority of HDDs / SSDs and their capacity
    # The report is collected in a variable and formatted afterwards: the
    # loops run in the current shell (the maj* assignments are kept) and the
    # table is fully printed before the lines that follow.
    column -tR1 <<< "" > /dev/null 2>&1 && colopts=(-R2,6)  # column supports -R option
    for cap in "${hddcap[@]}"; do
      ndrives=$(printf "%s\n" "${drives[@]}" | grep -c "HDD=${cap}\$")
      report+="Found $ndrives HDD(s) of size $cap GB"$'\n'  # display capacity
      (( ndrives >= majhdd )) && majhdd="$ndrives" && majhddcap="$cap"
    done
    for cap in "${ssdcap[@]}"; do
      ndrives=$(printf "%s\n" "${drives[@]}" | grep -c "SSD=${cap}\$")
      report+="Found $ndrives SSD(s) of size $cap GB"$'\n'  # display capacity
      (( ndrives >= majssd )) && majssd="$ndrives" && majssdcap="$cap"
    done
    printf '%s' "$report" | column -t -o' ' "${colopts[@]}"

    # display deviating drives
    colopts=()
    column -tR1 <<< "" > /dev/null 2>&1 && colopts=(-R2)  # column supports -R option
    if (( ${#hddcap[@]} > 1 )); then
      log "-------------------------------------------------------------------------------"
      log "The following HDD(s) deviate from the prevailing $majhddcap GB capacity:"
      {
        log "#drive(encl_number,encl_SN,slot,wwn,type)=size(GB)"
        printf "%s\n" "${drives[@]}" | grep HDD | grep -Ev "=${majhddcap}\$"  # BEWARE: requires non-null majhddcap
      } | column -s= -t "${colopts[@]}"
    fi
    if (( ssdwarn )); then
      log "-------------------------------------------------------------------------------"
      log "The following SSD(s) deviate from the prevailing $majssdcap GB capacity:"
      {
        log "#drive(encl_number,encl_SN,slot,wwn,type)=size(GB)"
        printf "%s\n" "${drives[@]}" | grep SSD | grep -Ev "=${majssdcap}\$|$logtipbackup"  # BEWARE: requires non-null majssdcap
      } | column -s= -t "${colopts[@]}"
    fi
    if (( logtipbackupwarn )); then
      log "-------------------------------------------------------------------------------"
      log "The logTipBackup SSDs in the first enclosure have different capacities:"
      {
        log "#drive(encl_number,encl_SN,slot,wwn,type)=size(GB)"
        printf "%s\n" "${drives[@]}" | grep -E "$logtipbackup"
      } | column -s= -t "${colopts[@]}"
    fi
  else
    log "Success: no discrepancy detected"
  fi
  log
  return "$warn"
}

#####
# Run getunmatched on the nodes of the current configuration (through
# dssgxdsh) and report the drives that are not referenced in the drive
# firmware table.
# Reads:   L, F, pair, ece (when pair is G100)
# Writes:  "$L"."$pair".drives-unmatched
# Returns: number of issue types found (0 or 1)
validatedrives() {
  local warn=0
  local -r fwtable=/opt/lenovo/dss/firmware/drive/firmwareTable.drive
  local -r unmatched="$L"."$pair".drives-unmatched

  log "Validating the product name and FRU of the drives..."
  local nodes="$pair"
  test x"$pair" = x"G100" && nodes=$(echo "${ece[@]}" | tr ' ' ,)

  # the function definitions and the dssgcolor* variables of this shell are
  # sent along with the command so that getunmatched is defined where it runs
  dssgxdsh "$nodes" "$(typeset -f; set | grep ^dssgcolor); getunmatched /tmp/${F}.top $fwtable" > "$unmatched"

  if test -s "$unmatched"; then
    (( warn++ ))
    WARN "The following drive(s) are not referenced in $fwtable on the DSS-G servers:"
    log "# Enclosure id, slot number, vendor, product name, FRU, firmware level"
    sort -n "$unmatched" | dssgxcoll -n
  else
    log "Success: no unmatched drive detected"
    log
  fi
  return "$warn"
}

#####
# List the drives of a topology file whose product id / FRU pair is not
# present in a firmware table.
# Arguments: $1 - topology file (colon-separated fields)
#            $2 - firmware table
# Outputs:   one "enclosure, slot, vendor, product, FRU, firmware" line per
#            unmatched drive, or a "file not found" message
# Returns:   1 when one of the files is missing
# Requires gawk (gensub, strtonum).
getunmatched() {
  local -r topo="$1"
  local -r fwtab="$2"

  test ! -f "$topo" && echo "${topo}: file not found" && return 1
  test ! -f "$fwtab" && echo "${fwtab}: file not found" && return 1

  awk '
    # return a new string where all %nn sequences (with nn an hexadecimal character code)
    # are replaced by the corresponding ascii character; e.g. "%20" gets converted into " "
    # (s and r are declared as extra parameters to keep them local to the function)
    function convert(str,    s, r) {
      s=str  # input string
      r=""   # result string
      while(1) {
        if(match(s, "%[A-Fa-f0-9]{2}")) {
          r=sprintf("%s%s%c", r, substr(s,1,RSTART-1), strtonum("0x" substr(s, RSTART+1, RLENGTH-1)))
          s=substr(s, RSTART+RLENGTH)
        } else {
          r=sprintf("%s%s", r, s)
          break
        }
      }
      return r
    } # convert

    # first file: the firmware table; its lines are never parsed as topology
    FILENAME == ARGV[1] {
      if(length(gensub(" |#.*", "", "g", $0)) && $7)  # non-empty lines (comments removed)
        table[convert($1)","convert($7)]=1            # pick product id and FRU from the firmware table
      next
    }

    # second file: parse the output from mmgetpdisktopology
    {
      split($0, a, ":")
      typ=convert(a[2] );  # scsi type
      vid=convert(a[9] );  # vendor
      pid=convert(a[10]);  # product
      fw=convert(a[11]);   # firmware
      fru=convert(a[13]);  # FRU
      enc=convert(a[17]);  # enclosure
      slt=convert(a[19]);  # slot
      if(typ != "0" || pid~/RAID/ || slt=="") next  # skip anything but enclosure drive
      if(! ((pid","fru) in table)) {
        if(match(slt,"-"))
          printf "%s, %-4s, %s, %s, %s, %s\n", enc, length(slt)==3 ? gensub("-","-0",1,slt) : slt, vid, pid, fru, fw  # D3284
        else
          printf "%s, %02d, %s, %s, %s, %s\n", enc, slt, vid, pid, fru, fw  # D1224
      }
    }' "$fwtab" "$topo"
}

#####
# Report the missing IOM / enclosure messages found in the topology summary
# of the current configuration.
# Reads:   L, pair
# Writes:  "$L"."$pair".enclosures
# Returns: number of issue types found (0 or 1)
checkenclosures() {
  local warn=0
  local -r issues="$L"."$pair".enclosures

  log "Checking for enclosure issues..."
  # enclosures/IOMs are off
  grep -E 'No.*GNR|NOTFOUND|IOM.*not found' "$L"."$pair".topsum > "$issues"

  if test -s "$issues"; then
    (( warn++ ))
    WARN "Cannot find IOM(s) or drive enclosure(s):"
    dssgxcoll -n < "$issues"
  else
    log "Success: no enclosure issue detected"
    log
  fi
  return "$warn"
}

#####
# Report the cabling related messages found in the topology summary of the
# current configuration.
# Reads:   L, pair
# Writes:  "$L"."$pair".cabling
# Returns: number of issue types found (0 or 1)
checkcabling() {
  local warn=0
  local -r summary="$L"."$pair".topsum
  local -r issues="$L"."$pair".cabling

  log "Checking for cabling issues..."
  {
    grep -i "Enclosure.*undetermined" "$summary"
    grep -i "HBA to enclosure cabling" "$summary"
    grep -i "appears only on the.*path" "$summary"
  } > "$issues"

  if test -s "$issues"; then
    (( warn++ ))
    WARN "The following problems related to cabling issues were detected:"
    dssgxcoll -n < "$issues"
    WARN "Check that cabling is done properly between the DSS-G servers and the storage enclosure(s)."
  else
    log "Success: no cabling issue detected"
  fi
  log
  return "$warn"
}

##### map enclosure S/N with its number and reformat drive slot; output sed commands
# Reads:   L, servers (set by checksizes), g100 (set by main)
# Outputs: a sed script on stdout (three substitutions per enclosure/node)
displaydrive() {
  local e i num
  # build "enclosure_serial=enclosure_number" for all enclosures from combined topsummary
  # ([^[:space:]] is used for the serial: "\s" is not a class inside a bracket expression)
  local -r enclosures=$(grep -E 'Enclosure.*number' "$L"."$servers".topsum \
    | sed 's,.*Enclosure\s*\([^[:space:]]*\)\s\+(.*number\s*\([0-9]*\).*,\1=\2,' \
    | LC_ALL=C sort -u)
  local -A encnum

  if test x"$enclosures" = x"internal=1"; then  # DSS-G100
    i=1
    for e in $(tr ',' ' ' <<< "${g100[@]}"); do
      printf -v num '%02d' "$i"
      encnum["$e"]="$num"  # encnum[node]=index
      (( i++ ))
    done
  else
    for e in $enclosures; do
      encnum[${e%%=*}]=${e##*=}  # encnum[enclosure_serial]=enclosure_number
    done
  fi

  # replace "encl_serial,slot,wwn,type" with "encl_number,encl_serial,slot2digits,wwn,type"
  # (names are passed as printf arguments, never as part of the format string)
  for e in "${!encnum[@]}"; do
    printf 's/%s,\\([0-9]\\)-\\([0-9]\\),/%s,\\1-0\\2,/ ; ' "$e" "$e"  # e.g. "1-4" becomes "1-04" (D3284)
    printf 's/%s,\\([0-9]\\),/%s,0\\1,/ ; ' "$e" "$e"                  # e.g. "2" becomes "02" (D1224)
    printf 's/%s/%s,%s/ ; ' "$e" "${encnum[$e]}" "$e"                  # prefix enclosure serial with its number
  done
}

#####
main "$@" 2>&1 | tee >(dssgstripcolors > "$L".log)  # strip colors in log file
retval="${PIPESTATUS[0]}"
# 255 means that the help was displayed: nothing worth keeping in the logs
((retval==255)) && cleanup && retval=0
exit "$retval"