committing changes in /etc after emerge run
Package changes: +app-admin/mcelog-122
This commit is contained in:
parent
e8f383dcfa
commit
ceaa4c576c
3
cron.daily/mcelog
Normal file
3
cron.daily/mcelog
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash
# Daily cron job: collect the pending machine check events from mcelog and
# append them to /var/log/mcelog, each line prefixed with a
# "<date> <hostname> " header.

# prefix_lines PREFIX
# Copy stdin to stdout with PREFIX prepended verbatim to every line.
# The prefix is never parsed as a pattern or replacement, so characters such
# as ',', '&' or '\' in it cannot break or alter the output.
# A final line without a trailing newline is still emitted.
prefix_lines() {
	local prefix=$1 line
	while IFS= read -r line || [[ -n "$line" ]]; do
		printf '%s%s\n' "$prefix" "$line"
	done
}

# Computed once so that every line written by this run carries the same stamp.
header="$(date +"%b %d %H:%M:%S") $(hostname) "

/usr/sbin/mcelog --ignorenodev --filter | prefix_lines "$header" >> /var/log/mcelog
|
16
init.d/mcelog
Executable file
16
init.d/mcelog
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/sbin/runscript
|
||||||
|
# Copyright 1999-2014 Gentoo Foundation
|
||||||
|
# Distributed under the terms of the GNU General Public License v2
|
||||||
|
# $Id$
|
||||||
|
|
||||||
|
# Short text describing this service.
description="Start/stop mcelog in daemon mode"

# Daemon binary, the pid file it is told to write, and its arguments.
# MCELOG_OPTS is not set in this script; it is expected to come from outside
# (presumably the service's conf.d file - verify).
command="/usr/sbin/mcelog"
pidfile="/var/run/mcelog.pid"
command_args="--daemon --pidfile ${pidfile} ${MCELOG_OPTS}"

# Service dependency declarations.
depend() {
	# Ordering only: start after udev when udev is present.
	after udev
	# Hard requirement: local filesystems must be mounted.
	need localmount
	# Optional: use a logging service when one is available.
	use logging
}
|
15
logrotate.d/mcelog
Normal file
15
logrotate.d/mcelog
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Rotation policy for the mcelog log file.
/var/log/mcelog {
    compress
    dateext
    maxage 365
    rotate 99
    size=+2048k
    notifempty
    missingok
    copytruncate
    postrotate
        chmod 644 /var/log/mcelog
        # Signal the daemon only when its pid file is readable. An "if" is
        # used instead of "[ ... ] && kill" so that a missing pid file (mcelog
        # run from cron only, no daemon) leaves the script's exit status at 0
        # and logrotate does not report a postrotate failure.
        if [ -r /var/run/mcelog.pid ]; then
            kill -USR1 "$(cat /var/run/mcelog.pid)"
        fi
    endscript
}
|
||||||
|
|
23
mcelog/bus-error-trigger
Executable file
23
mcelog/bus-error-trigger
Executable file
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script can be executed by mcelog in daemon mode when a socket
|
||||||
|
# receives Bus and Interconnect errors
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# LEVEL Interconnect level
|
||||||
|
# PARTICIPATION Processor Participation (Originator, Responder or Observer)
|
||||||
|
# REQUEST Request type (read, write, prefetch, etc.)
|
||||||
|
# ORIGIN Memory or IO
|
||||||
|
# TIMEOUT The request timed out or not
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable bus-error-trigger.local exists there.
if [ -x ./bus-error-trigger.local ]; then
	. ./bus-error-trigger.local
fi

exit 0
|
34
mcelog/cache-error-trigger
Executable file
34
mcelog/cache-error-trigger
Executable file
@ -0,0 +1,34 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# cache error trigger. This shell script is executed by mcelog in daemon mode
|
||||||
|
# when a CPU reports excessive corrected cache errors. This could be an indication
|
||||||
|
# for future uncorrected errors.
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# MESSAGE Human readable error message
|
||||||
|
# CPU Linux CPU number that triggered the error
|
||||||
|
# LEVEL Cache level affected by error
|
||||||
|
# TYPE Cache type affected by error (Data,Instruction,Generic)
|
||||||
|
# AFFECTED_CPUS List of CPUs sharing the affected cache
|
||||||
|
# SOCKETID Socket ID of affected CPU
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
#
|
||||||
|
# offline the CPUs sharing the affected cache
|
||||||
|
#
|
||||||
|
# Exit status of the whole trigger: becomes 1 as soon as one CPU could not be
# taken offline.
EXIT=0

# offline_cpu CPU [SYSFS_CPU_DIR]
# Take one logical CPU offline by writing 0 to its sysfs "online" file, then
# read the file back to confirm the change took effect.
#   CPU            Linux CPU number; must consist of decimal digits only
#   SYSFS_CPU_DIR  directory holding the cpuN entries
#                  (default: /sys/devices/system/cpu)
# Returns 0 when the file reads back "0", 1 otherwise.
offline_cpu() {
	case "$1" in
	''|*[!0-9]*)
		# Refuse anything that is not a plain number. Formatting such a
		# value with printf %d yields 0, which would silently target
		# cpu0 instead of the intended CPU.
		logger -s -p daemon.warn -t mcelog "Offlining CPU $1 failed"
		return 1
		;;
	esac

	logger -s -p daemon.crit -t mcelog "Offlining CPU $1 due to cache error threshold"
	F="${2:-/sys/devices/system/cpu}/cpu${1}/online"
	echo 0 > "$F"
	# A failed write or a missing file leaves the read-back different from
	# "0", so both cases are reported as a failure here.
	if [ "$(cat "$F")" != "0" ] ; then
		logger -s -p daemon.warn -t mcelog "Offlining CPU $1 failed"
		return 1
	fi
	return 0
}

# AFFECTED_CPUS is a whitespace separated list, so it is deliberately left
# unquoted here to be split into one word per CPU.
for i in $AFFECTED_CPUS ; do
	offline_cpu "$i" || EXIT=1
done
|
||||||
|
|
||||||
|
# Optional site-specific hook: sourced from the current directory when an
# executable cache-error-trigger.local exists there.
if [ -x ./cache-error-trigger.local ]; then
	. ./cache-error-trigger.local
fi

# Report the status accumulated in EXIT while offlining the CPUs.
exit $EXIT
|
29
mcelog/dimm-error-trigger
Executable file
29
mcelog/dimm-error-trigger
Executable file
@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script can be executed by mcelog in daemon mode when a DIMM
|
||||||
|
# exceeds a pre-configured error threshold
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# THRESHOLD human readable threshold status
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on
|
||||||
|
# what triggered the event
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# DMI_LOCATION DIMM location from DMI/SMBIOS if available
|
||||||
|
# DMI_NAME DIMM identifier from DMI/SMBIOS if available
|
||||||
|
# DIMM DIMM number reported by hardware
|
||||||
|
# CHANNEL Channel number reported by hardware
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# CECOUNT Total corrected error count for DIMM
|
||||||
|
# UCCOUNT Total uncorrected error count for DIMM
|
||||||
|
# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds)
|
||||||
|
# THRESHOLD_COUNT Total number of events in current threshold time period of specific type
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable dimm-error-trigger.local exists there.
if [ -x ./dimm-error-trigger.local ]; then
	. ./dimm-error-trigger.local
fi

exit 0
|
23
mcelog/iomca-error-trigger
Executable file
23
mcelog/iomca-error-trigger
Executable file
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script can be executed by mcelog in daemon mode when a socket
|
||||||
|
# receives Bus and Interconnect errors
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# CPU Linux CPU number that triggered the error
|
||||||
|
# SET PCI segment number
|
||||||
|
# BUS PCI bus number
|
||||||
|
# DEVICE PCI device number
|
||||||
|
# FUNCTION PCI function number
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable iomca-error-trigger.local exists there.
if [ -x ./iomca-error-trigger.local ]; then
	. ./iomca-error-trigger.local
fi

exit 0
|
181
mcelog/mcelog.conf
Normal file
181
mcelog/mcelog.conf
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
#
|
||||||
|
# Example config file for mcelog
|
||||||
|
# mcelog is the user space backend that decodes and processes machine check events
|
||||||
|
# (cpu hardware errors) reported by the CPU to the kernel
|
||||||
|
#
|
||||||
|
|
||||||
|
# general format
|
||||||
|
#optionname = value
|
||||||
|
# white space is not allowed in value currently, except at the end where it is dropped
|
||||||
|
#
|
||||||
|
|
||||||
|
# In general all command line options that are not commands work here.
|
||||||
|
# See man mcelog or mcelog --help for a list.
|
||||||
|
# e.g. to enable the --no-syslog option use
|
||||||
|
#no-syslog = yes (or no to disable)
|
||||||
|
# when the option has an argument
|
||||||
|
#logfile = /tmp/logfile
|
||||||
|
# below are the options which are not command line options.
|
||||||
|
|
||||||
|
# Set CPU type for which mcelog decodes events:
|
||||||
|
#cpu = type
|
||||||
|
# For valid values for type please see mcelog --help.
|
||||||
|
# If this value is set incorrectly the decoded output will be likely incorrect.
|
||||||
|
# By default when this parameter is not set mcelog uses the CPU it is running on
|
||||||
|
# on very new kernels the mcelog events reported by the kernel also carry
|
||||||
|
# the CPU type which is used too when available and not overridden.
|
||||||
|
|
||||||
|
# Enable daemon mode:
|
||||||
|
#daemon = yes
|
||||||
|
# By default mcelog just processes the currently pending events and exits.
|
||||||
|
# In daemon mode it will keep running as a daemon in the background and poll
|
||||||
|
# the kernel for events and then decode them.
|
||||||
|
|
||||||
|
# Filter out known broken events by default.
|
||||||
|
filter = yes
|
||||||
|
# Don't log memory errors individually.
|
||||||
|
# They still get accounted if that is enabled.
|
||||||
|
#filter-memory-errors = yes
|
||||||
|
|
||||||
|
# output in undecoded raw format to be easier machine readable
|
||||||
|
# (default is decoded).
|
||||||
|
#raw = yes
|
||||||
|
|
||||||
|
# Set CPU Mhz to decode uptime from time stamp counter (output
|
||||||
|
# unreliable, not needed on new kernels which report the event time
|
||||||
|
# directly. A lot of systems don't have a linear time stamp clock
|
||||||
|
# and the output is wrong then.
|
||||||
|
# Normally mcelog tries to figure out if the TSC is reliable
|
||||||
|
# and only uses the current frequency then.
|
||||||
|
# Setting a frequency forces timestamp decoding.
|
||||||
|
# This setting is obsolete with modern kernels which report the time
|
||||||
|
# directly.
|
||||||
|
#cpumhz = 1800.00
|
||||||
|
|
||||||
|
# log output options
|
||||||
|
# Log decoded machine checks in syslog (default stdout or syslog for daemon)
|
||||||
|
#syslog = yes
|
||||||
|
# Log decoded machine checks in syslog with error level
|
||||||
|
#syslog-error = yes
|
||||||
|
# Never log anything to syslog
|
||||||
|
#no-syslog = yes
|
||||||
|
# Append log output to logfile instead of stdout. Only when no syslog logging is active
|
||||||
|
#logfile = filename
|
||||||
|
|
||||||
|
# Use SMBIOS information to decode DIMMs (needs root).
|
||||||
|
# This function is not recommended to use right now and generally not needed.
|
||||||
|
# The exception is memdb prepopulation, which is configured separately below.
|
||||||
|
#dmi = no
|
||||||
|
|
||||||
|
# When in daemon mode run as this user after set up.
|
||||||
|
# Note that the triggers will run as this user too.
|
||||||
|
# Setting this to non root will mean that triggers cannot take some corrective
|
||||||
|
# action, like offlining objects.
|
||||||
|
#run-credentials-user = root
|
||||||
|
|
||||||
|
# group to run as daemon with
|
||||||
|
# default to the group of the run-credentials-user
|
||||||
|
#run-credentials-group = nobody
|
||||||
|
|
||||||
|
[server]
|
||||||
|
# user allowed to access client socket.
|
||||||
|
# when set to * match any
|
||||||
|
# root is always allowed to access.
|
||||||
|
# default: root only
|
||||||
|
client-user = root
|
||||||
|
# group allowed to access mcelog
|
||||||
|
# When no group is configured any group matches (but still user checking).
|
||||||
|
# when set to * match any
|
||||||
|
#client-group = root
|
||||||
|
# Path to the unix socket for client<->server communication.
|
||||||
|
# When no socket-path is configured the server will not start
|
||||||
|
#socket-path = /var/run/mcelog-client
|
||||||
|
# When mcelog starts it checks if a server is already running. This configures the timeout
|
||||||
|
# for this check.
|
||||||
|
#initial-ping-timeout = 2
|
||||||
|
#
|
||||||
|
[dimm]
|
||||||
|
# Is the in memory DIMM error tracking enabled?
|
||||||
|
# Only works on systems with integrated memory controller and
|
||||||
|
# which are supported.
|
||||||
|
# Only takes effect in daemon mode.
|
||||||
|
dimm-tracking-enabled = yes
|
||||||
|
# Use DMI information from the BIOS to prepopulate DIMM database.
|
||||||
|
# Note this might not work with all BIOS and requires mcelog to run as root.
|
||||||
|
# Alternative is to let mcelog create DIMM objects on demand.
|
||||||
|
dmi-prepopulate = yes
|
||||||
|
#
|
||||||
|
# Execute these triggers when the rate of corrected or uncorrected
|
||||||
|
# Errors per DIMM exceeds the threshold.
|
||||||
|
# Note when the hardware does not report DIMMs this might also
|
||||||
|
# be per channel.
|
||||||
|
# The default of 10/24h is reasonable for server quality
|
||||||
|
# DDR3 DIMMs as of 2009/10.
|
||||||
|
#uc-error-trigger = dimm-error-trigger
|
||||||
|
uc-error-threshold = 1 / 24h
|
||||||
|
#ce-error-trigger = dimm-error-trigger
|
||||||
|
ce-error-threshold = 10 / 24h
|
||||||
|
|
||||||
|
[socket]
|
||||||
|
# Enable memory error accounting per socket.
|
||||||
|
socket-tracking-enabled = yes
|
||||||
|
|
||||||
|
# Threshold and trigger for uncorrected memory errors on a socket.
|
||||||
|
# mem-uc-error-trigger = socket-memory-error-trigger
|
||||||
|
|
||||||
|
mem-uc-error-threshold = 100 / 24h
|
||||||
|
|
||||||
|
# Trigger script for corrected memory errors on a socket.
|
||||||
|
mem-ce-error-trigger = socket-memory-error-trigger
|
||||||
|
|
||||||
|
# Threshold on when to trigger a corrected error for the socket.
|
||||||
|
|
||||||
|
mem-ce-error-threshold = 100 / 24h
|
||||||
|
|
||||||
|
# Log socket error threshold explicitly?
|
||||||
|
mem-ce-error-log = yes
|
||||||
|
|
||||||
|
# Trigger script for uncorrected bus error events
|
||||||
|
bus-uc-threshold-trigger = bus-error-trigger
|
||||||
|
|
||||||
|
# Trigger script for uncorrected IOMCA errors
|
||||||
|
iomca-threshold-trigger = iomca-error-trigger
|
||||||
|
|
||||||
|
# Trigger script for other uncategorized errors
|
||||||
|
unknown-threshold-trigger = unknown-error-trigger
|
||||||
|
|
||||||
|
[cache]
|
||||||
|
# Processing of cache error thresholds reported by Intel CPUs.
|
||||||
|
cache-threshold-trigger = cache-error-trigger
|
||||||
|
|
||||||
|
# Should cache threshold events be logged explicitly?
|
||||||
|
cache-threshold-log = yes
|
||||||
|
|
||||||
|
[page]
|
||||||
|
# Memory error accounting per 4K memory page.
|
||||||
|
# Threshold for the corrected memory errors trigger script.
|
||||||
|
memory-ce-threshold = 10 / 24h
|
||||||
|
|
||||||
|
# Trigger script for corrected errors.
|
||||||
|
# memory-ce-trigger = page-error-trigger
|
||||||
|
|
||||||
|
# Should page threshold events be logged explicitly?
|
||||||
|
memory-ce-log = yes
|
||||||
|
|
||||||
|
# specify the internal action in mcelog to exceeding a page error threshold
|
||||||
|
# this is done in addition to executing the trigger script if available
|
||||||
|
# off no action
|
||||||
|
# account only account errors
|
||||||
|
# soft try to soft-offline page without killing any processes
|
||||||
|
# This requires an up-to-date kernel. Might not be successful.
|
||||||
|
# hard try to hard-offline page by killing processes
|
||||||
|
# Requires an up-to-date kernel. Might not be successful.
|
||||||
|
# soft-then-hard First try to soft offline, then try hard offlining
|
||||||
|
#memory-ce-action = off|account|soft|hard|soft-then-hard
|
||||||
|
memory-ce-action = soft
|
||||||
|
|
||||||
|
[trigger]
|
||||||
|
# Maximum number of running triggers
|
||||||
|
children-max = 2
|
||||||
|
# execute triggers in this directory
|
||||||
|
directory = /etc/mcelog
|
30
mcelog/page-error-trigger
Executable file
30
mcelog/page-error-trigger
Executable file
@ -0,0 +1,30 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script can be executed by mcelog in daemon mode when a page
|
||||||
|
# in memory exceeds a pre-configured corrected error threshold.
|
||||||
|
# mcelog internally also supports offlining the page through the kernel.
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# THRESHOLD human readable threshold status
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on
|
||||||
|
# what triggered the event
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# DMI_LOCATION DIMM location from DMI/SMBIOS if available
|
||||||
|
# DMI_NAME DIMM identifier from DMI/SMBIOS if available
|
||||||
|
# DIMM DIMM number reported by hardware
|
||||||
|
# CHANNEL Channel number reported by hardware
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# CECOUNT Total corrected error count for DIMM
|
||||||
|
# UCCOUNT Total uncorrected error count for DIMM
|
||||||
|
# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds)
|
||||||
|
# THRESHOLD_COUNT Total number of events in current threshold time period of specific type
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable page-error-trigger.local exists there.
if [ -x ./page-error-trigger.local ]; then
	. ./page-error-trigger.local
fi

exit 0
|
25
mcelog/socket-memory-error-trigger
Executable file
25
mcelog/socket-memory-error-trigger
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script can be executed by mcelog in daemon mode when a socket
|
||||||
|
# exceeds a pre-configured error threshold for memory errors
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# THRESHOLD human readable threshold status
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# TOTALCOUNT total count of errors for current socket of CE/UC depending on
|
||||||
|
# what triggered the event
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# CECOUNT Total corrected error count for socket
|
||||||
|
# UCCOUNT Total uncorrected error count for socket
|
||||||
|
# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds)
|
||||||
|
# THRESHOLD_COUNT Total number of events in current threshold time period of specific type
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable socket-memory-error-trigger.local exists there.
if [ -x ./socket-memory-error-trigger.local ]; then
	. ./socket-memory-error-trigger.local
fi

exit 0
|
26
mcelog/unknown-error-trigger
Executable file
26
mcelog/unknown-error-trigger
Executable file
@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# This shell script is executed by mcelog in daemon mode when
|
||||||
|
# a not otherwise handled machine check error happens.
|
||||||
|
#
|
||||||
|
# environment:
|
||||||
|
# MESSAGE Human readable consolidated error message
|
||||||
|
# LOCATION Consolidated location as a single string
|
||||||
|
# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
|
||||||
|
# CPU Linux CPU number that triggered the error
|
||||||
|
# STATUS IA32_MCi_STATUS register value
|
||||||
|
# ADDR IA32_MCi_ADDR register value
|
||||||
|
# MISC IA32_MCi_MISC register value
|
||||||
|
# MCGSTATUS IA32_MCG_STATUS register value
|
||||||
|
# MCGCAP IA32_MCG_CAP register value
|
||||||
|
# For details on the register layout please see the Intel SDM http://www.intel.com/sdm
|
||||||
|
# volume 3, chapter 15
|
||||||
|
#
|
||||||
|
# note: will run as mcelog configured user
|
||||||
|
# this can be changed in mcelog.conf
|
||||||
|
|
||||||
|
# Report the event and its location to syslog (facility daemon, level err,
# tag "mcelog"); -s also copies each message to stderr.
logger -s -p daemon.err -t mcelog "$MESSAGE"
logger -s -p daemon.err -t mcelog "Location: $LOCATION"

# Optional site-specific hook: sourced from the current directory when an
# executable unknown-error-trigger.local exists there.
if [ -x ./unknown-error-trigger.local ]; then
	. ./unknown-error-trigger.local
fi

exit 0
|
Loading…
Reference in New Issue
Block a user