Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions nodescraper/plugins/inband/dmesg/dmesg_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
),
message="RAS Correctable Error",
event_category=EventCategory.RAS,
event_priority=EventPriority.WARNING,
),
ErrorRegex(
regex=re.compile(
Expand All @@ -270,6 +271,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
),
message="RAS Corrected PCIe Error",
event_category=EventCategory.RAS,
event_priority=EventPriority.WARNING,
),
ErrorRegex(
regex=re.compile(r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.*GPU reset begin.*)"),
Expand Down Expand Up @@ -334,8 +336,18 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
event_category=EventCategory.RAS,
),
ErrorRegex(
regex=re.compile(r"\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}"),
message="MCE Error",
regex=re.compile(
r"\[Hardware Error\]:.+MC\d+_STATUS\[[^\]]*\|CE\|[^\]]*\].*(?:\n.*){0,5}"
),
message="MCE Corrected Error",
event_category=EventCategory.RAS,
event_priority=EventPriority.WARNING,
),
ErrorRegex(
regex=re.compile(
r"\[Hardware Error\]:.+MC\d+_STATUS\[[^\]]*\|UC\|[^\]]*\].*(?:\n.*){0,5}"
),
message="MCE Uncorrected Error",
event_category=EventCategory.RAS,
),
ErrorRegex(
Expand All @@ -351,6 +363,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
),
message="RAS Corrected Error",
event_category=EventCategory.RAS,
event_priority=EventPriority.WARNING,
),
ErrorRegex(
regex=re.compile(r"x86/cpu: SGX disabled by BIOS"),
Expand Down
60 changes: 53 additions & 7 deletions test/unit/plugin/test_dmesg_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,53 @@ def test_custom_regex_empty_list(system_info):
assert res.events[0].description == "Out of memory error"


def test_mce_ce_uc_and_ras_corrected_warning_priorities(system_info):
    """Corrected MCE/RAS events are WARNING; the uncorrected MCE event is ERROR.

    Feeds one log containing an MCE corrected line, an MCE uncorrected line,
    a RAS corrected line, a RAS correctable line and a corrected PCIe block
    to the analyzer, then checks the priority of each resulting event and
    the overall execution status.
    """
    log_lines = [
        # MCE corrected: "|CE|" appears inside MCn_STATUS[...]
        "kern :err : 2038-01-19T00:00:00,000000+00:00 "
        "[Hardware Error]: Machine Check: CPU0 MC0_STATUS[0xcafe|CE|Misc]: 0x0\n",
        # MCE uncorrected: "|UC|" appears inside MCn_STATUS[...]
        "kern :err : 2038-01-19T00:00:01,000000+00:00 "
        "[Hardware Error]: Machine Check: CPU1 MC1_STATUS[0xfeed|UC|AddrV]: 0x0\n",
        # RAS corrected, single line
        "kern :err : 2038-01-19T00:00:02,000000+00:00 "
        "trace [Hardware Error]: Corrected error, DRAM threshold\n",
        # RAS correctable
        "kern :err : 2038-01-19T00:00:03,000000+00:00 "
        "amdgpu 0000:de:ad.0: amdgpu: socket: 0 7 correctable hardware errors detected in total in gfx block\n",
        # RAS corrected PCIe, spread over three consecutive lines
        "[Hardware Error]: event severity: corrected, generic\n",
        "[Hardware Error]: Error 2, type: corrected, details\n",
        "[Hardware Error]: section_type: PCIe error, device 1111:11:11.1\n",
    ]

    result = DmesgAnalyzer(system_info=system_info).analyze_data(
        DmesgData(dmesg_content="".join(log_lines)),
        args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False),
    )

    events_by_description = {event.description: event for event in result.events}

    # Expected priority per event description, checked in this order.
    expected_priorities = {
        "MCE Corrected Error": EventPriority.WARNING,
        "MCE Uncorrected Error": EventPriority.ERROR,
        "RAS Corrected Error": EventPriority.WARNING,
        "RAS Correctable Error": EventPriority.WARNING,
        "RAS Corrected PCIe Error": EventPriority.WARNING,
    }
    for description, priority in expected_priorities.items():
        assert description in events_by_description
        assert events_by_description[description].priority == priority

    # The uncorrected MCE event is ERROR, so the overall status is ERROR.
    assert result.status == ExecutionStatus.ERROR


def test_resolve_priority_no_match(system_info):
"""No rule matches → returns the original priority unchanged."""
analyzer = DmesgAnalyzer(system_info=system_info)
Expand Down Expand Up @@ -870,11 +917,10 @@ def test_priority_override_rules_in_analyze_data(system_info):
"""priority_override_rules passed via DmesgAnalyzerArgs overrides matched regex priorities."""
dmesg_data = DmesgData(
dmesg_content=(
# RAS event — default ERROR, should become WARNING
"kern :err : 2024-10-07T10:17:15,145363-04:00 "
"amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n"
"kern :err : 2038-01-19T00:00:00,000000+00:00 "
"amdgpu 0000:de:ad.0: amdgpu: socket: 0 9 correctable hardware errors detected in total in gfx block\n"
# SW_DRIVER event — default ERROR, should stay ERROR (no matching rule)
"kern :err : 2024-10-07T10:17:15,145363-04:00 IO_PAGE_FAULT\n"
"kern :err : 2038-01-19T00:00:01,000000+00:00 IO_PAGE_FAULT\n"
)
)

Expand Down Expand Up @@ -905,8 +951,8 @@ def test_priority_override_no_change_keeps_original(system_info):
"""NO_CHANGE rule leaves the original event priority intact."""
dmesg_data = DmesgData(
dmesg_content=(
"kern :err : 2024-10-07T10:17:15,145363-04:00 "
"amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n"
"kern :err : 2038-01-19T00:00:00,000000+00:00 "
"amdgpu 0000:de:ad.0: amdgpu: socket: 0 9 correctable hardware errors detected in total in gfx block\n"
)
)

Expand All @@ -922,7 +968,7 @@ def test_priority_override_no_change_keeps_original(system_info):
)

assert len(res.events) == 1
assert res.events[0].priority == EventPriority.ERROR
assert res.events[0].priority == EventPriority.WARNING


def test_custom_regex_with_multiline_pattern(system_info):
Expand Down
Loading