diff --git a/aci-preupgrade-validation-script.py b/aci-preupgrade-validation-script.py index 8d47b755..5e79f56e 100644 --- a/aci-preupgrade-validation-script.py +++ b/aci-preupgrade-validation-script.py @@ -2611,19 +2611,35 @@ def hw_program_fail_check(cversion, **kwargs): @check_wrapper(check_title="Switch SSD Health (F3073, F3074 equipment-flash-warning)") -def switch_ssd_check(**kwargs): +def switch_ssd_check(cversion, tversion, **kwargs): result = FAIL_O - headers = ["Fault", "Pod", "Node", "SSD Model", "% Threshold Crossed", "Recommended Action"] + headers = ["Fault", "Pod", "Node", "SSD Model", "% Threshold Crossed"] data = [] - unformatted_headers = ["Fault", "Fault DN", "% Threshold Crossed", "Recommended Action"] + unformatted_headers = ["Fault", "Fault DN", "% Threshold Crossed"] unformatted_data = [] thresh = {'F3073': '90%', 'F3074': '80%'} - recommended_action = { - 'F3073': 'Contact Cisco TAC for replacement procedure', - 'F3074': 'Monitor (no impact to upgrades)' - } + overall_ra = "" + micron_ra = ( + '\n\tRun the SSD Lifetime Validation script manually on all identified nodes before upgrading.\n' + '\tScript location: https://github.com/datacenter/aci-tac-scripts/tree/main/SSD%20Lifetime%20Validation\n' + ) + fault_ra = "Contact Cisco TAC for replacement procedure" + mixed_ra = ( + "Mixed SSD faults detected:" + "\n\tFor non-Micron SSDs (F3073/F3074 rows): Contact Cisco TAC for replacement procedure.\n" + "\tFor Micron SSD: Run the SSD Lifetime Validation script manually on all identified nodes before upgrading.\n" + "\tScript location: https://github.com/datacenter/aci-tac-scripts/tree/main/SSD%20Lifetime%20Validation\n" + ) + doc_url = "https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#switch-ssd-health" + if not tversion: + return Result(result=MANUAL, msg=TVER_MISSING) + + affected = ['6.1(5e)', '6.2(1g)'] + cver_affected = any(cversion.same_as(v) for v in affected) + tver_affected = any(tversion.same_as(v) for v in 
affected) + cs_regex = r"model:(?P\w+)," faultInsts = icurl('class', 'faultInst.json?query-target-filter=or(eq(faultInst.code,"F3073"),eq(faultInst.code,"F3074"))') @@ -2632,25 +2648,91 @@ def switch_ssd_check(**kwargs): dn_array = re.search(node_regex, faultInst['faultInst']['attributes']['dn']) cs_array = re.search(cs_regex, faultInst['faultInst']['attributes']['changeSet']) if dn_array and cs_array: + ssd_model = cs_array.group("model") data.append([fc, dn_array.group("pod"), dn_array.group("node"), cs_array.group("model"), - thresh.get(fc, ''), - recommended_action.get(fc, 'Resolve the fault')]) + thresh.get(fc, '')]) else: unformatted_data.append([fc, faultInst['faultInst']['attributes']['dn'], - thresh.get(fc, ''), - recommended_action.get(fc, 'Resolve the fault')]) - if not data and not unformatted_data: - result = PASS - return Result( - result=result, - headers=headers, - data=data, - unformatted_headers=unformatted_headers, - unformatted_data=unformatted_data, - doc_url=doc_url, - ) + thresh.get(fc, '')]) + has_fault_data = bool(data or unformatted_data) + + def collect_micron(classify): + eqptFlashs = icurl('class', 'eqptFlash.json?query-target-filter=eq(eqptFlash.vendor,"Micron")') + if not eqptFlashs: + return False, False + + micron_ssds_per_node = defaultdict(set) + micron_rows = [] + + for eqptFlash in eqptFlashs: + attr = eqptFlash['eqptFlash']['attributes'] + dn = re.search(node_regex, attr.get("dn", "")) + node_id = dn.group("node") + pod_id = dn.group("pod") + model = attr.get('model', '') + + micron_ssds_per_node[node_id].add(model) + micron_rows.append(['CSCwt38698 (False Fault Micron SSD defect)', + pod_id, + node_id, model, 'N/A']) + + if classify: + genuine_faults = [] + micron_false_faults = [] + + for fault_row in data: + node_id = fault_row[2] + ssd_model = fault_row[3] + + is_micron_fault = (node_id in micron_ssds_per_node and ssd_model in micron_ssds_per_node[node_id]) + + if not is_micron_fault: + genuine_faults.append(fault_row) + 
else: + for micron_row in micron_rows: + if micron_row[2] == node_id and micron_row[3] == ssd_model: + micron_false_faults.append(micron_row) + break + + del data[:] + del unformatted_data[:] + data.extend(genuine_faults) + data.extend(micron_false_faults) + return bool(micron_false_faults), bool(genuine_faults) + else: + data.extend(micron_rows) + return True, False + + if cver_affected: + has_micron_faults, has_genuine_fault = collect_micron(classify=True) + if has_micron_faults: + result = MANUAL + if has_genuine_fault: + overall_ra = mixed_ra + else: + overall_ra = micron_ra + elif has_fault_data: + result = FAIL_O + overall_ra = fault_ra + else: + result = PASS + elif tver_affected: + if has_fault_data: + result = FAIL_O + overall_ra = fault_ra + elif collect_micron(classify=False)[0]: + result = MANUAL + overall_ra = micron_ra + else: + result = PASS + else: + result = FAIL_O if has_fault_data else PASS + if has_fault_data: + overall_ra = fault_ra + + return Result(result=result, headers=headers, data=data, unformatted_headers=unformatted_headers, unformatted_data=unformatted_data, recommended_action=overall_ra, doc_url=doc_url) # Connection Based Check @check_wrapper(check_title="APIC SSD Health") @@ -6967,4 +7049,4 @@ def main(_args=None): msg = "Abort due to unexpected error - {}".format(e) prints(msg) log.error(msg, exc_info=True) - sys.exit(1) + sys.exit(1) \ No newline at end of file diff --git a/docs/docs/validations.md b/docs/docs/validations.md index 85588880..f7886811 100644 --- a/docs/docs/validations.md +++ b/docs/docs/validations.md @@ -810,6 +810,12 @@ See the [ACI Switch Node SSD Lifetime Explained technote][9] for more details. --- omit --- ``` +Due to [CSCwt38698][76], Micron SSDs in the fabric may report false end-of-life faults after an upgrade to 6.1(5e) or 6.2(1g). + +To confirm whether a reported fault is genuine or a false alarm, run the SSD Lifetime Validation script manually on all identified nodes.
If the script shows that the SSD lifetime is critically low, follow the SSD replacement procedure outlined in the field notice to ensure that the node remains available after the upgrade. To avoid the false alarm altogether, choose a target version that is not impacted by this defect. + +- Script location: [SSD Lifetime Validation](https://github.com/datacenter/aci-tac-scripts/tree/main/SSD%20Lifetime%20Validation) + ### Config On APIC Connected Port @@ -2916,3 +2922,5 @@ Contact Cisco TAC for next steps. For more details, refer to the workaround in [ [73]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwo74485 [74]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwm42741 [75]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwt69100 +[76]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwt38698 + diff --git a/tests/checks/switch_ssd_check/eqptFlash_mixed_node.json b/tests/checks/switch_ssd_check/eqptFlash_mixed_node.json new file mode 100644 index 00000000..9440e4fd --- /dev/null +++ b/tests/checks/switch_ssd_check/eqptFlash_mixed_node.json @@ -0,0 +1,12 @@ +[ + { + "eqptFlash": { + "attributes": { + "dn": "topology/pod-1/node-205/sys/ch/supslot-2/sup/flash", + "model": "Micron_M600_MTFDDAT064MBF", + "vendor": "Micron", + "ser": "MSA20400892" + } + } + } +] \ No newline at end of file diff --git a/tests/checks/switch_ssd_check/eqptFlash_multi_micron.json b/tests/checks/switch_ssd_check/eqptFlash_multi_micron.json new file mode 100644 index 00000000..c88aa8fa --- /dev/null +++ b/tests/checks/switch_ssd_check/eqptFlash_multi_micron.json @@ -0,0 +1,23 @@ +{ + "totalCount": "2", + "imdata": [ + { + "eqptFlash": { + "attributes": { + "dn": "topology/pod-1/node-205/sys/ch/supslot-1/sup/flash", + "vendor": "Micron", + "model": "Micron_M550_MTFDDAT256MAY" + } + } + }, + { + "eqptFlash": { + "attributes": { + "dn": "topology/pod-1/node-101/sys/ch/supslot-1/sup/flash", + "vendor": "Micron", + "model": "Micron_M600_MTFDDAT064MBF" + } + } + } + ] +} \ No newline at end of
file diff --git a/tests/checks/switch_ssd_check/eqptFlash_single_micron_noFault.json b/tests/checks/switch_ssd_check/eqptFlash_single_micron_noFault.json new file mode 100644 index 00000000..691f8ad5 --- /dev/null +++ b/tests/checks/switch_ssd_check/eqptFlash_single_micron_noFault.json @@ -0,0 +1,14 @@ +{ + "totalCount": "1", + "imdata": [ + { + "eqptFlash": { + "attributes": { + "dn": "topology/pod-1/node-103/sys/ch/supslot-1/sup/flash", + "vendor": "Micron", + "model": "MTFDDAK240MBB" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/checks/switch_ssd_check/eqptFlash_single_micron_withFault.json b/tests/checks/switch_ssd_check/eqptFlash_single_micron_withFault.json new file mode 100644 index 00000000..4f9d6492 --- /dev/null +++ b/tests/checks/switch_ssd_check/eqptFlash_single_micron_withFault.json @@ -0,0 +1,14 @@ +{ + "totalCount": "1", + "imdata": [ + { + "eqptFlash": { + "attributes": { + "dn": "topology/pod-1/node-205/sys/ch/supslot-1/sup/flash", + "vendor": "Micron", + "model": "Micron_M550_MTFDDAT256MAY" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/checks/switch_ssd_check/test_switch_ssd_check.py b/tests/checks/switch_ssd_check/test_switch_ssd_check.py index 1f7202f5..9c141247 100644 --- a/tests/checks/switch_ssd_check/test_switch_ssd_check.py +++ b/tests/checks/switch_ssd_check/test_switch_ssd_check.py @@ -14,18 +14,138 @@ # icurl queries faultInst = 'faultInst.json?query-target-filter=or(eq(faultInst.code,"F3073"),eq(faultInst.code,"F3074"))' +eqptFlash = 'eqptFlash.json?query-target-filter=eq(eqptFlash.vendor,"Micron")' @pytest.mark.parametrize( - "icurl_outputs, expected_result, expected_data", + "icurl_outputs, tversion, cversion, expected_result, expected_data", [ + # MANUAL - tversion missing (TVER_MISSING), no faults ( {faultInst: []}, + None, "6.0(2h)", + script.MANUAL, + [], + ), + # FAIL_O - genuine F3073/F3074 faults, version not affected + ( + {faultInst: read_data(dir, "faultInst.json")}, + "6.0(2h)", 
"6.0(1a)", + script.FAIL_O, + [ + [ + "F3073", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "90%", + ], + [ + "F3074", + "1", + "101", + "Micron_M600_MTFDDAT064MBF", + "80%", + ], + ], + ), + # PASS - no faults, version not affected (Micron block skipped) + ( + {faultInst: []}, + "6.0(2h)", "6.0(1a)", script.PASS, [], ), + # PASS - no faults, tversion affected 6.1(5e), no Micron drives ( - {faultInst: read_data(dir, "faultInst.json")}, + {faultInst: [], eqptFlash: []}, + "6.1(5e)", "6.0(2h)", + script.PASS, + [], + ), + # PASS - no faults, tversion affected 6.2(1g), no Micron drives + ( + {faultInst: [], eqptFlash: []}, + "6.2(1g)", "6.0(2h)", + script.PASS, + [], + ), + # PASS - no faults, cversion affected 6.1(5e), no Micron drives + ( + {faultInst: [], eqptFlash: []}, + "6.2(2a)", "6.1(5e)", + script.PASS, + [], + ), + # MANUAL - no faults, tversion affected 6.1(5e), single Micron drive + ( + {faultInst: [], eqptFlash: read_data(dir, "eqptFlash_single_micron_noFault.json")}, + "6.1(5e)", "6.0(2h)", + script.MANUAL, + [ + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "103", + "MTFDDAK240MBB", + "N/A", + ], + ], + ), + # MANUAL - no faults, multiple Micron drives, tversion affected + ( + {faultInst: [], eqptFlash: read_data(dir, "eqptFlash_multi_micron.json")}, + "6.1(5e)", "6.0(2h)", + script.MANUAL, + [ + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "N/A", + ], + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "101", + "Micron_M600_MTFDDAT064MBF", + "N/A", + ], + ], + ), + # MANUAL - false fault present + cversion affected + Micron drive + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_multi_micron.json"), + }, + "6.2(2a)", "6.1(5e)", + script.MANUAL, + [ + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "N/A", + ], + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "101", + 
"Micron_M600_MTFDDAT064MBF", + "N/A", + ], + ], + ), + # FAIL_O - Genuine fault present + cversion affected + Micron drive + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_single_micron_noFault.json"), + }, + "6.2(2a)", "6.1(5e)", script.FAIL_O, [ [ @@ -34,7 +154,6 @@ "205", "Micron_M550_MTFDDAT256MAY", "90%", - "Contact Cisco TAC for replacement procedure", ], [ "F3074", @@ -42,13 +161,140 @@ "101", "Micron_M600_MTFDDAT064MBF", "80%", - "Monitor (no impact to upgrades)", ], ], ), + # FAIL_O - fault present + tversion matched + Micron drive found + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_single_micron_noFault.json"), + }, + "6.1(5e)", "6.0(2h)", + script.FAIL_O, + [ + [ + "F3073", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "90%", + ], + [ + "F3074", + "1", + "101", + "Micron_M600_MTFDDAT064MBF", + "80%", + ], + ], + ), + # MANUAL - false fault present + cversion matched + Micron drive found + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_multi_micron.json"), + }, + "6.2(2a)", "6.1(5e)", + script.MANUAL, + [ + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "N/A", + ], + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "101", + "Micron_M600_MTFDDAT064MBF", + "N/A", + ], + ], + ), + # FAIL_O - fault present + cversion matched + Micron drive absent + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: [], + }, + "6.2(2a)", "6.1(5e)", + script.FAIL_O, + [ + [ + "F3073", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "90%", + ], + [ + "F3074", + "1", + "101", + "Micron_M600_MTFDDAT064MBF", + "80%", + ], + ], + ), + # MANUAL - Genuine + false fault present + cversion matched + Micron drive found + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_single_micron_withFault.json"), + }, + "6.2(2a)", "6.1(5e)", + 
script.MANUAL, + [ + [ + 'F3074', + '1', + '101', + 'Micron_M600_MTFDDAT064MBF', + '80%', + ], + [ + "CSCwt38698 (False Fault Micron SSD defect)", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "N/A", + ], + ], + ), + # FAIL_O - Same node (205), different slots: genuine fault (Intel SSD) + ( + { + faultInst: read_data(dir, "faultInst.json"), + eqptFlash: read_data(dir, "eqptFlash_mixed_node.json"), + }, + "6.2(2a)", "6.1(5e)", + script.FAIL_O, + [ + [ + "F3073", + "1", + "205", + "Micron_M550_MTFDDAT256MAY", + "90%", + ], + [ + 'F3074', + '1', + '101', + 'Micron_M600_MTFDDAT064MBF', + '80%', + ], + ], + ), ], ) -def test_logic(run_check, mock_icurl, expected_result, expected_data): - result = run_check() +def test_logic(run_check, mock_icurl, tversion, cversion, expected_result, expected_data): + result = run_check( + tversion=script.AciVersion(tversion) if tversion else None, + cversion=script.AciVersion(cversion) if cversion else None + ) assert result.result == expected_result - assert result.data == expected_data + assert result.data == expected_data \ No newline at end of file