From ceaa4c576cc7d98eea4d3ef937f10a525def68b6 Mon Sep 17 00:00:00 2001 From: Hans Wurst Date: Sat, 15 Aug 2015 01:00:14 +0200 Subject: [PATCH] committing changes in /etc after emerge run Package changes: +app-admin/mcelog-122 --- cron.daily/mcelog | 3 + init.d/mcelog | 16 +++ logrotate.d/mcelog | 15 +++ mcelog/bus-error-trigger | 23 ++++ mcelog/cache-error-trigger | 34 ++++++ mcelog/dimm-error-trigger | 29 +++++ mcelog/iomca-error-trigger | 23 ++++ mcelog/mcelog.conf | 181 +++++++++++++++++++++++++++++ mcelog/page-error-trigger | 30 +++++ mcelog/socket-memory-error-trigger | 25 ++++ mcelog/unknown-error-trigger | 26 +++++ 11 files changed, 405 insertions(+) create mode 100644 cron.daily/mcelog create mode 100755 init.d/mcelog create mode 100644 logrotate.d/mcelog create mode 100755 mcelog/bus-error-trigger create mode 100755 mcelog/cache-error-trigger create mode 100755 mcelog/dimm-error-trigger create mode 100755 mcelog/iomca-error-trigger create mode 100644 mcelog/mcelog.conf create mode 100755 mcelog/page-error-trigger create mode 100755 mcelog/socket-memory-error-trigger create mode 100755 mcelog/unknown-error-trigger diff --git a/cron.daily/mcelog b/cron.daily/mcelog new file mode 100644 index 0000000..dd50f1b --- /dev/null +++ b/cron.daily/mcelog @@ -0,0 +1,3 @@ +#!/bin/bash +header="$(date +"%b %d %H:%M:%S") $(hostname) " +/usr/sbin/mcelog --ignorenodev --filter | sed "s,^,$header,g" >> /var/log/mcelog diff --git a/init.d/mcelog b/init.d/mcelog new file mode 100755 index 0000000..a282118 --- /dev/null +++ b/init.d/mcelog @@ -0,0 +1,16 @@ +#!/sbin/runscript +# Copyright 1999-2014 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 +# $Id$ + +description='Start/stop mcelog in daemon mode' + +pidfile="/var/run/mcelog.pid" +command="/usr/sbin/mcelog" +command_args="--daemon --pidfile ${pidfile} ${MCELOG_OPTS}" + +depend() { + after udev + need localmount + use logging +} diff --git a/logrotate.d/mcelog b/logrotate.d/mcelog new 
file mode 100644 index 0000000..049f299 --- /dev/null +++ b/logrotate.d/mcelog @@ -0,0 +1,15 @@ +/var/log/mcelog { + compress + dateext + maxage 365 + rotate 99 + size=+2048k + notifempty + missingok + copytruncate + postrotate + chmod 644 /var/log/mcelog + [ -r /var/run/mcelog.pid ] && kill -USR1 `cat /var/run/mcelog.pid` + endscript +} + diff --git a/mcelog/bus-error-trigger b/mcelog/bus-error-trigger new file mode 100755 index 0000000..c996001 --- /dev/null +++ b/mcelog/bus-error-trigger @@ -0,0 +1,23 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a socket +# receives Bus and Interconnect errors +# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# LEVEL Interconnect level +# PARTICIPATION Processor Participation (Originator, Responder or Observer) +# REQUEST Request type (read, write, prefetch, etc.) +# ORIGIN Memory or IO +# TIMEOUT The request timed out or not +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./bus-error-trigger.local ] && . ./bus-error-trigger.local + +exit 0 diff --git a/mcelog/cache-error-trigger b/mcelog/cache-error-trigger new file mode 100755 index 0000000..b5cc462 --- /dev/null +++ b/mcelog/cache-error-trigger @@ -0,0 +1,34 @@ +#!/bin/sh +# cache error trigger. This shell script is executed by mcelog in daemon mode +# when a CPU reports excessive corrected cache errors. This could be an indication +# for future uncorrected errors.
+# +# environment: +# MESSAGE Human readable error message +# CPU Linux CPU number that triggered the error +# LEVEL Cache level affected by error +# TYPE Cache type affected by error (Data,Instruction,Generic) +# AFFECTED_CPUS List of CPUs sharing the affected cache +# SOCKETID Socket ID of affected CPU +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +# +# offline the CPUs sharing the affected cache +# +EXIT=0 + +for i in $AFFECTED_CPUS ; do + logger -s -p daemon.crit -t mcelog "Offlining CPU $i due to cache error threshold" + F=$(printf "/sys/devices/system/cpu/cpu%d/online" $i) + echo 0 > $F + if [ "$(cat $F)" != "0" ] ; then + logger -s -p daemon.warn -t mcelog "Offlining CPU $i failed" + EXIT=1 + fi +done + +[ -x ./cache-error-trigger.local ] && . ./cache-error-trigger.local + +exit $EXIT diff --git a/mcelog/dimm-error-trigger b/mcelog/dimm-error-trigger new file mode 100755 index 0000000..e00d8df --- /dev/null +++ b/mcelog/dimm-error-trigger @@ -0,0 +1,29 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a DIMM +# exceeds a pre-configured error threshold +# +# environment: +# THRESHOLD human readable threshold status +# MESSAGE Human readable consolidated error message +# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on +# what triggered the event +# LOCATION Consolidated location as a single string +# DMI_LOCATION DIMM location from DMI/SMBIOS if available +# DMI_NAME DIMM identifier from DMI/SMBIOS if available +# DIMM DIMM number reported by hardware +# CHANNEL Channel number reported by hardware +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CECOUNT Total corrected error count for DIMM +# UCCOUNT Total uncorrected error count for DIMM +# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds) +# THRESHOLD_COUNT Total number of events in current threshold time period of specific type +# +# note: will run
as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./dimm-error-trigger.local ] && . ./dimm-error-trigger.local + +exit 0 diff --git a/mcelog/iomca-error-trigger b/mcelog/iomca-error-trigger new file mode 100755 index 0000000..3888461 --- /dev/null +++ b/mcelog/iomca-error-trigger @@ -0,0 +1,23 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a socket +# receives IOMCA errors +# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CPU Linux CPU number that triggered the error +# SET PCI segment number +# BUS PCI bus number +# DEVICE PCI device number +# FUNCTION PCI function number +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./iomca-error-trigger.local ] && . ./iomca-error-trigger.local + +exit 0 diff --git a/mcelog/mcelog.conf b/mcelog/mcelog.conf new file mode 100644 index 0000000..f8abb99 --- /dev/null +++ b/mcelog/mcelog.conf @@ -0,0 +1,181 @@ +# +# Example config file for mcelog +# mcelog is the user space backend that decodes and processes machine check events +# (cpu hardware errors) reported by the CPU to the kernel +# + +# general format +#optionname = value +# white space is not allowed in value currently, except at the end where it is dropped +# + +# In general all command line options that are not commands work here. +# See man mcelog or mcelog --help for a list. +# e.g. to enable the --no-syslog option use +#no-syslog = yes (or no to disable) +# when the option has an argument +#logfile = /tmp/logfile +# below are the options which are not command line options.
+ +# Set CPU type for which mcelog decodes events: +#cpu = type +# For valid values for type please see mcelog --help. +# If this value is set incorrectly the decoded output will likely be incorrect. +# By default when this parameter is not set mcelog uses the CPU it is running on. +# On very new kernels the mcelog events reported by the kernel also carry +# the CPU type, which is used too when available and not overridden. + +# Enable daemon mode: +#daemon = yes +# By default mcelog just processes the currently pending events and exits. +# In daemon mode it will keep running as a daemon in the background and poll +# the kernel for events and then decode them. + +# Filter out known broken events by default. +filter = yes +# Don't log memory errors individually. +# They still get accounted if that is enabled. +#filter-memory-errors = yes + +# output in undecoded raw format to be easier machine readable +# (default is decoded). +#raw = yes + +# Set CPU MHz to decode uptime from time stamp counter (output +# unreliable, not needed on new kernels which report the event time +# directly). A lot of systems don't have a linear time stamp clock +# and the output is wrong then. +# Normally mcelog tries to figure out if the TSC is reliable +# and only uses the current frequency then. +# Setting a frequency forces timestamp decoding. +# This setting is obsolete with modern kernels which report the time +# directly. +#cpumhz = 1800.00 + +# log output options +# Log decoded machine checks in syslog (default stdout or syslog for daemon) +#syslog = yes +# Log decoded machine checks in syslog with error level +#syslog-error = yes +# Never log anything to syslog +#no-syslog = yes +# Append log output to logfile instead of stdout. Only when no syslog logging is active +#logfile = filename + +# Use SMBIOS information to decode DIMMs (needs root). +# This function is not recommended to use right now and generally not needed.
+# The exception is memdb prepopulation, which is configured separately below. +#dmi = no + +# When in daemon mode run as this user after set up. +# Note that the triggers will run as this user too. +# Setting this to non root will mean that triggers cannot take some corrective +# action, like offlining objects. +#run-credentials-user = root + +# group to run as daemon with +# default to the group of the run-credentials-user +#run-credentials-group = nobody + +[server] +# user allowed to access client socket. +# when set to * match any +# root is always allowed to access. +# default: root only +client-user = root +# group allowed to access mcelog +# When no group is configured any group matches (but still user checking). +# when set to * match any +#client-group = root +# Path to the unix socket for client<->server communication. +# When no socket-path is configured the server will not start +#socket-path = /var/run/mcelog-client +# When mcelog starts it checks if a server is already running. This configures the timeout +# for this check. +#initial-ping-timeout = 2 +# +[dimm] +# Is the in memory DIMM error tracking enabled? +# Only works on systems with integrated memory controller and +# which are supported. +# Only takes effect in daemon mode. +dimm-tracking-enabled = yes +# Use DMI information from the BIOS to prepopulate DIMM database. +# Note this might not work with all BIOS and requires mcelog to run as root. +# Alternative is to let mcelog create DIMM objects on demand. +dmi-prepopulate = yes +# +# Execute these triggers when the rate of corrected or uncorrected +# Errors per DIMM exceeds the threshold. +# Note when the hardware does not report DIMMs this might also +# be per channel. +# The default of 10/24h is reasonable for server quality +# DDR3 DIMMs as of 2009/10. 
+#uc-error-trigger = dimm-error-trigger +uc-error-threshold = 1 / 24h +#ce-error-trigger = dimm-error-trigger +ce-error-threshold = 10 / 24h + +[socket] +# Enable memory error accounting per socket. +socket-tracking-enabled = yes + +# Threshold and trigger for uncorrected memory errors on a socket. +# mem-uc-error-trigger = socket-memory-error-trigger + +mem-uc-error-threshold = 100 / 24h + +# Trigger script for corrected memory errors on a socket. +mem-ce-error-trigger = socket-memory-error-trigger + +# Threshold on when to trigger a corrected error for the socket. + +mem-ce-error-threshold = 100 / 24h + +# Log socket error threshold explicitly? +mem-ce-error-log = yes + +# Trigger script for uncorrected bus error events +bus-uc-threshold-trigger = bus-error-trigger + +# Trigger script for uncorrected IOMCA errors +iomca-threshold-trigger = iomca-error-trigger + +# Trigger script for other uncategorized errors +unknown-threshold-trigger = unknown-error-trigger + +[cache] +# Processing of cache error thresholds reported by Intel CPUs. +cache-threshold-trigger = cache-error-trigger + +# Should cache threshold events be logged explicitly? +cache-threshold-log = yes + +[page] +# Memory error accounting per 4K memory page. +# Threshold for the corrected memory errors trigger script. +memory-ce-threshold = 10 / 24h + +# Trigger script for corrected errors. +# memory-ce-trigger = page-error-trigger + +# Should page threshold events be logged explicitly? +memory-ce-log = yes + +# specify the internal action in mcelog on exceeding a page error threshold +# this is done in addition to executing the trigger script if available +# off no action +# account only account errors +# soft try to soft-offline page without killing any processes +# This requires an up-to-date kernel. Might not be successful. +# hard try to hard-offline page by killing processes +# Requires an up-to-date kernel. Might not be successful.
+# soft-then-hard First try to soft offline, then try hard offlining +#memory-ce-action = off|account|soft|hard|soft-then-hard +memory-ce-action = soft + +[trigger] +# Maximum number of running triggers +children-max = 2 +# execute triggers in this directory +directory = /etc/mcelog diff --git a/mcelog/page-error-trigger b/mcelog/page-error-trigger new file mode 100755 index 0000000..59dfbdc --- /dev/null +++ b/mcelog/page-error-trigger @@ -0,0 +1,30 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a page +# in memory exceeds a pre-configured corrected error threshold. +# mcelog internally also supports offlining the page through the kernel. +# +# environment: +# THRESHOLD human readable threshold status +# MESSAGE Human readable consolidated error message +# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on +# what triggered the event +# LOCATION Consolidated location as a single string +# DMI_LOCATION DIMM location from DMI/SMBIOS if available +# DMI_NAME DIMM identifier from DMI/SMBIOS if available +# DIMM DIMM number reported by hardware +# CHANNEL Channel number reported by hardware +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CECOUNT Total corrected error count for DIMM +# UCCOUNT Total uncorrected error count for DIMM +# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds) +# THRESHOLD_COUNT Total number of events in current threshold time period of specific type +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./page-error-trigger.local ] && .
./page-error-trigger.local + +exit 0 diff --git a/mcelog/socket-memory-error-trigger b/mcelog/socket-memory-error-trigger new file mode 100755 index 0000000..c5930d5 --- /dev/null +++ b/mcelog/socket-memory-error-trigger @@ -0,0 +1,25 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a socket +# exceeds a pre-configured error threshold for memory errors +# +# environment: +# THRESHOLD human readable threshold status +# MESSAGE Human readable consolidated error message +# TOTALCOUNT total count of errors for current socket of CE/UC depending on +# what triggered the event +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CECOUNT Total corrected error count for socket +# UCCOUNT Total uncorrected error count for socket +# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds) +# THRESHOLD_COUNT Total number of events in current threshold time period of specific type +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./socket-memory-error-trigger.local ] && . ./socket-memory-error-trigger.local + +exit 0 diff --git a/mcelog/unknown-error-trigger b/mcelog/unknown-error-trigger new file mode 100755 index 0000000..fa2866c --- /dev/null +++ b/mcelog/unknown-error-trigger @@ -0,0 +1,26 @@ +#!/bin/sh +# This shell script is executed by mcelog in daemon mode when +# a machine check error occurs that is not otherwise handled.
+# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CPU Linux CPU number that triggered the error +# STATUS IA32_MCi_STATUS register value +# ADDR IA32_MCi_ADDR register value +# MISC IA32_MCi_MISC register value +# MCGSTATUS IA32_MCG_STATUS register value +# MCGCAP IA32_MCG_CAP register value +# For details on the register layout please see the Intel SDM http://www.intel.com/sdm +# volume 3, chapter 15 +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./unknown-error-trigger.local ] && . ./unknown-error-trigger.local + +exit 0