summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/admin-guide/index.rst1
-rw-r--r--Documentation/admin-guide/ras.rst1190
-rw-r--r--Documentation/driver-api/edac.rst178
-rw-r--r--Documentation/driver-api/index.rst1
-rw-r--r--Documentation/edac.txt812
-rw-r--r--MAINTAINERS3
-rw-r--r--drivers/edac/altera_edac.c1
-rw-r--r--drivers/edac/amd64_edac.h2
-rw-r--r--drivers/edac/amd76x_edac.c2
-rw-r--r--drivers/edac/amd8111_edac.c1
-rw-r--r--drivers/edac/amd8131_edac.c1
-rw-r--r--drivers/edac/cell_edac.c2
-rw-r--r--drivers/edac/cpc925_edac.c1
-rw-r--r--drivers/edac/e752x_edac.c2
-rw-r--r--drivers/edac/e7xxx_edac.c2
-rw-r--r--drivers/edac/edac_device.c79
-rw-r--r--drivers/edac/edac_device.h (renamed from drivers/edac/edac_core.h)309
-rw-r--r--drivers/edac/edac_device_sysfs.c4
-rw-r--r--drivers/edac/edac_mc.c84
-rw-r--r--drivers/edac/edac_mc.h245
-rw-r--r--drivers/edac/edac_mc_sysfs.c2
-rw-r--r--drivers/edac/edac_module.c2
-rw-r--r--drivers/edac/edac_module.h4
-rw-r--r--drivers/edac/edac_pci.c84
-rw-r--r--drivers/edac/edac_pci.h271
-rw-r--r--drivers/edac/edac_pci_sysfs.c13
-rw-r--r--drivers/edac/fsl_ddr_edac.c1
-rw-r--r--drivers/edac/ghes_edac.c2
-rw-r--r--drivers/edac/highbank_l2_edac.c1
-rw-r--r--drivers/edac/highbank_mc_edac.c1
-rw-r--r--drivers/edac/i3000_edac.c2
-rw-r--r--drivers/edac/i3200_edac.c2
-rw-r--r--drivers/edac/i5000_edac.c2
-rw-r--r--drivers/edac/i5100_edac.c1
-rw-r--r--drivers/edac/i5400_edac.c2
-rw-r--r--drivers/edac/i7300_edac.c2
-rw-r--r--drivers/edac/i7core_edac.c2
-rw-r--r--drivers/edac/i82443bxgx_edac.c2
-rw-r--r--drivers/edac/i82860_edac.c2
-rw-r--r--drivers/edac/i82875p_edac.c2
-rw-r--r--drivers/edac/i82975x_edac.c2
-rw-r--r--drivers/edac/ie31200_edac.c2
-rw-r--r--drivers/edac/layerscape_edac.c2
-rw-r--r--drivers/edac/mpc85xx_edac.c1
-rw-r--r--drivers/edac/mv64x60_edac.c1
-rw-r--r--drivers/edac/octeon_edac-l2c.c1
-rw-r--r--drivers/edac/octeon_edac-lmc.c1
-rw-r--r--drivers/edac/octeon_edac-pc.c1
-rw-r--r--drivers/edac/octeon_edac-pci.c1
-rw-r--r--drivers/edac/pasemi_edac.c2
-rw-r--r--drivers/edac/ppc4xx_edac.c2
-rw-r--r--drivers/edac/r82600_edac.c2
-rw-r--r--drivers/edac/sb_edac.c2
-rw-r--r--drivers/edac/skx_edac.c2
-rw-r--r--drivers/edac/synopsys_edac.c2
-rw-r--r--drivers/edac/tile_edac.c2
-rw-r--r--drivers/edac/x38_edac.c2
-rw-r--r--drivers/edac/xgene_edac.c1
-rw-r--r--include/linux/edac.h154
60 files changed, 2027 insertions, 1478 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 5bd4b07c2f90..c8a8eb1a2b11 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -152,8 +152,6 @@ driver-model/
- directory with info about Linux driver model.
early-userspace/
- info about initramfs, klibc, and userspace early during boot.
-edac.txt
- - information on EDAC - Error Detection And Correction
efi-stub.txt
- How to use the EFI boot stub to bypass GRUB or elilo on EFI systems.
eisa.txt
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 2681cbd24cdd..8ddae4e4299a 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking.
binfmt-misc
mono
java
+ ras
.. only:: subproject and html
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
new file mode 100644
index 000000000000..d71340e86c27
--- /dev/null
+++ b/Documentation/admin-guide/ras.rst
@@ -0,0 +1,1190 @@
+.. include:: <isonum.txt>
+
+============================================
+Reliability, Availability and Serviceability
+============================================
+
+RAS concepts
+************
+
+Reliability, Availability and Serviceability (RAS) is a concept used on
+servers meant to measure their robusteness.
+
+Reliability
+ is the probability that a system will produce correct outputs.
+
+ * Generally measured as Mean Time Between Failures (MTBF)
+ * Enhanced by features that help to avoid, detect and repair hardware faults
+
+Availability
+ is the probability that a system is operational at a given time
+
+ * Generally measured as a percentage of downtime per a period of time
+ * Often uses mechanisms to detect and correct hardware faults in
+ runtime;
+
+Serviceability (or maintainability)
+ is the simplicity and speed with which a system can be repaired or
+ maintained
+
+ * Generally measured on Mean Time Between Repair (MTBR)
+
+Improving RAS
+-------------
+
+In order to reduce systems downtime, a system should be capable of detecting
+hardware errors, and, when possible correcting them in runtime. It should
+also provide mechanisms to detect hardware degradation, in order to warn
+the system administrator to take the action of replacing a component before
+it causes data loss or system downtime.
+
+Among the monitoring measures, the most usual ones include:
+
+* CPU – detect errors at instruction execution and at L1/L2/L3 caches;
+* Memory – add error correction logic (ECC) to detect and correct errors;
+* I/O – add CRC checksums for tranfered data;
+* Storage – RAID, journal file systems, checksums,
+ Self-Monitoring, Analysis and Reporting Technology (SMART).
+
+By monitoring the number of occurrences of error detections, it is possible
+to identify if the probability of hardware errors is increasing, and, on such
+case, do a preventive maintainance to replace a degrated component while
+those errors are correctable.
+
+Types of errors
+---------------
+
+Most mechanisms used on modern systems use use technologies like Hamming
+Codes that allow error correction when the number of errors on a bit packet
+is below a threshold. If the number of errors is above, those mechanisms
+can indicate with a high degree of confidence that an error happened, but
+they can't correct.
+
+Also, sometimes an error occur on a component that it is not used. For
+example, a part of the memory that it is not currently allocated.
+
+That defines some categories of errors:
+
+* **Correctable Error (CE)** - the error detection mechanism detected and
+ corrected the error. Such errors are usually not fatal, although some
+ Kernel mechanisms allow the system administrator to consider them as fatal.
+
+* **Uncorrected Error (UE)** - the amount of errors happened above the error
+ correction threshold, and the system was unable to auto-correct.
+
+* **Fatal Error** - when an UE error happens on a critical component of the
+ system (for example, a piece of the Kernel got corrupted by an UE), the
+ only reliable way to avoid data corruption is to hang or reboot the machine.
+
+* **Non-fatal Error** - when an UE error happens on an unused component,
+ like a CPU in power down state or an unused memory bank, the system may
+ still run, eventually replacing the affected hardware by a hot spare,
+ if available.
+
+ Also, when an error happens on an userspace process, it is also possible to
+ kill such process and let userspace restart it.
+
+The mechanism for handling non-fatal errors is usually complex and may
+require the help of some userspace application, in order to apply the
+policy desired by the system administrator.
+
+Identifying a bad hardware component
+------------------------------------
+
+Just detecting a hardware flaw is usually not enough, as the system needs
+to pinpoint to the minimal replaceable unit (MRU) that should be exchanged
+to make the hardware reliable again.
+
+So, it requires not only error logging facilities, but also mechanisms that
+will translate the error message to the silkscreen or component label for
+the MRU.
+
+Typically, it is very complex for memory, as modern CPUs interlace memory
+from different memory modules, in order to provide a better performance. The
+DMI BIOS usually have a list of memory module labels, with can be obtained
+using the ``dmidecode`` tool. For example, on a desktop machine, it shows::
+
+ Memory Device
+ Total Width: 64 bits
+ Data Width: 64 bits
+ Size: 16384 MB
+ Form Factor: SODIMM
+ Set: None
+ Locator: ChannelA-DIMM0
+ Bank Locator: BANK 0
+ Type: DDR4
+ Type Detail: Synchronous
+ Speed: 2133 MHz
+ Rank: 2
+ Configured Clock Speed: 2133 MHz
+
+On the above example, a DDR4 SO-DIMM memory module is located at the
+system's memory labeled as "BANK 0", as given by the *bank locator* field.
+Please notice that, on such system, the *total width* is equal to the
+*data witdh*. It means that such memory module doesn't have error
+detection/correction mechanisms.
+
+Unfortunately, not all systems use the same field to specify the memory
+bank. On this example, from an older server, ``dmidecode`` shows::
+
+ Memory Device
+ Array Handle: 0x1000
+ Error Information Handle: Not Provided
+ Total Width: 72 bits
+ Data Width: 64 bits
+ Size: 8192 MB
+ Form Factor: DIMM
+ Set: 1
+ Locator: DIMM_A1
+ Bank Locator: Not Specified
+ Type: DDR3
+ Type Detail: Synchronous Registered (Buffered)
+ Speed: 1600 MHz
+ Rank: 2
+ Configured Clock Speed: 1600 MHz
+
+There, the DDR3 RDIMM memory module is located at the system's memory labeled
+as "DIMM_A1", as given by the *locator* field. Please notice that this
+memory module has 64 bits of *data witdh* and 72 bits of *total width*. So,
+it has 8 extra bits to be used by error detection and correction mechanisms.
+Such kind of memory is called Error-correcting code memory (ECC memory).
+
+To make things even worse, it is not uncommon that systems with different
+labels on their system's board to use exactly the same BIOS, meaning that
+the labels provided by the BIOS won't match the real ones.
+
+ECC memory
+----------
+
+As mentioned on the previous section, ECC memory has extra bits to be
+used for error correction. So, on 64 bit systems, a memory module
+has 64 bits of *data width*, and 74 bits of *total width*. So, there are
+8 bits extra bits to be used for the error detection and correction
+mechanisms. Those extra bits are called *syndrome*\ [#f1]_\ [#f2]_.
+
+So, when the cpu requests the memory controller to write a word with
+*data width*, the memory controller calculates the *syndrome* in real time,
+using Hamming code, or some other error correction code, like SECDED+,
+producing a code with *total width* size. Such code is then written
+on the memory modules.
+
+At read, the *total width* bits code is converted back, using the same
+ECC code used on write, producing a word with *data width* and a *syndrome*.
+The word with *data width* is sent to the CPU, even when errors happen.
+
+The memory controller also looks at the *syndrome* in order to check if
+there was an error, and if the ECC code was able to fix such error.
+If the error was corrected, a Corrected Error (CE) happened. If not, an
+Uncorrected Error (UE) happened.
+
+The information about the CE/UE errors is stored on some special registers
+at the memory controller and can be accessed by reading such registers,
+either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64
+bit CPUs, such errors can also be retrieved via the Machine Check
+Architecture (MCA)\ [#f3]_.
+
+.. [#f1] Please notice that several memory controllers allow operation on a
+ mode called "Lock-Step", where it groups two memory modules together,
+ doing 128-bit reads/writes. That gives 16 bits for error correction, with
+ significatively improves the error correction mechanism, at the expense
+ that, when an error happens, there's no way to know what memory module is
+ to blame. So, it has to blame both memory modules.
+
+.. [#f2] Some memory controllers also allow using memory in mirror mode.
+ On such mode, the same data is written to two memory modules. At read,
+ the system checks both memory modules, in order to check if both provide
+ identical data. On such configuration, when an error happens, there's no
+ way to know what memory module is to blame. So, it has to blame both
+ memory modules (or 4 memory modules, if the system is also on Lock-step
+ mode).
+
+.. [#f3] For more details about the Machine Check Architecture (MCA),
+ please read Documentation/x86/x86_64/machinecheck at the Kernel tree.
+
+EDAC - Error Detection And Correction
+*************************************
+
+.. note::
+
+ "bluesmoke" was the name for this device driver subsystem when it
+ was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net.
+ That site is mostly archaic now and can be used only for historical
+ purposes.
+
+ When the subsystem was pushed upstream for the first time, on
+ Kernel 2.6.16, for the first time, it was renamed to ``EDAC``.
+
+Purpose
+-------
+
+The ``edac`` kernel module's goal is to detect and report hardware errors
+that occur within the computer system running under linux.
+
+Memory
+------
+
+Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the
+primary errors being harvested. These types of errors are harvested by
+the ``edac_mc`` device.
+
+Detecting CE events, then harvesting those events and reporting them,
+**can** but must not necessarily be a predictor of future UE events. With
+CE events only, the system can and will continue to operate as no data
+has been damaged yet.
+
+However, preventive maintenance and proactive part replacement of memory
+modules exhibiting CEs can reduce the likelihood of the dreaded UE events
+and system panics.
+
+Other hardware elements
+-----------------------
+
+A new feature for EDAC, the ``edac_device`` class of device, was added in
+the 2.6.23 version of the kernel.
+
+This new device type allows for non-memory type of ECC hardware detectors
+to have their states harvested and presented to userspace via the sysfs
+interface.
+
+Some architectures have ECC detectors for L1, L2 and L3 caches,
+along with DMA engines, fabric switches, main data path switches,
+interconnections, and various other hardware data paths. If the hardware
+reports it, then a edac_device device probably can be constructed to
+harvest and present that to userspace.
+
+
+PCI bus scanning
+----------------
+
+In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors
+in order to determine if errors are occurring during data transfers.
+
+The presence of PCI Parity errors must be examined with a grain of salt.
+There are several add-in adapters that do **not** follow the PCI specification
+with regards to Parity generation and reporting. The specification says
+the vendor should tie the parity status bits to 0 if they do not intend
+to generate parity. Some vendors do not do this, and thus the parity bit
+can "float" giving false positives.
+
+There is a PCI device attribute located in sysfs that is checked by
+the EDAC PCI scanning code. If that attribute is set, PCI parity/error
+scanning is skipped for that device. The attribute is::
+
+ broken_parity_status
+
+and is located in ``/sys/devices/pci<XXX>/0000:XX:YY.Z`` directories for
+PCI devices.
+
+
+Versioning
+----------
+
+EDAC is composed of a "core" module (``edac_core.ko``) and several Memory
+Controller (MC) driver modules. On a given system, the CORE is loaded
+and one MC driver will be loaded. Both the CORE and the MC driver (or
+``edac_device`` driver) have individual versions that reflect current
+release level of their respective modules.
+
+Thus, to "report" on what version a system is running, one must report
+both the CORE's and the MC driver's versions.
+
+
+Loading
+-------
+
+If ``edac`` was statically linked with the kernel then no loading
+is necessary. If ``edac`` was built as modules then simply modprobe
+the ``edac`` pieces that you need. You should be able to modprobe
+hardware-specific modules and have the dependencies load the necessary
+core modules.
+
+Example::
+
+ $ modprobe amd76x_edac
+
+loads both the ``amd76x_edac.ko`` memory controller module and the
+``edac_mc.ko`` core module.
+
+
+Sysfs interface
+---------------
+
+EDAC presents a ``sysfs`` interface for control and reporting purposes. It
+lives in the /sys/devices/system/edac directory.
+
+Within this directory there currently reside 2 components:
+
+ ======= ==============================
+ mc memory controller(s) system
+ pci PCI control and status system
+ ======= ==============================
+
+
+
+Memory Controller (mc) Model
+----------------------------
+
+Each ``mc`` device controls a set of memory modules [#f4]_. These modules
+are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``).
+There can be multiple csrows and multiple channels.
+
+.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
+ used to refer to a memory module, although there are other memory
+ packaging alternatives, like SO-DIMM, SIMM, etc. Along this document,
+ and inside the EDAC system, the term "dimm" is used for all memory
+ modules, even when they use a different kind of packaging.
+
+Memory controllers allow for several csrows, with 8 csrows being a
+typical value. Yet, the actual number of csrows depends on the layout of
+a given motherboard, memory controller and memory module characteristics.
+
+Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems)
+data transfers to/from the CPU from/to memory. Some newer chipsets allow
+for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory
+controllers. The following example will assume 2 channels:
+
+ +------------+-----------------------+
+ | Chip | Channels |
+ | Select +-----------+-----------+
+ | rows | ``ch0`` | ``ch1`` |
+ +============+===========+===========+
+ | ``csrow0`` | DIMM_A0 | DIMM_B0 |
+ +------------+ | |
+ | ``csrow1`` | | |
+ +------------+-----------+-----------+
+ | ``csrow2`` | DIMM_A1 | DIMM_B1 |
+ +------------+ | |
+ | ``csrow3`` | | |
+ +------------+-----------+-----------+
+
+In the above example, there are 4 physical slots on the motherboard
+for memory DIMMs:
+
+ +---------+---------+
+ | DIMM_A0 | DIMM_B0 |
+ +---------+---------+
+ | DIMM_A1 | DIMM_B1 |
+ +---------+---------+
+
+Labels for these slots are usually silk-screened on the motherboard.
+Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are
+channel 1. Notice that there are two csrows possible on a physical DIMM.
+These csrows are allocated their csrow assignment based on the slot into
+which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
+Channel, the csrows cross both DIMMs.
+
+Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
+Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
+will have just one csrow (csrow0). csrow1 will be empty. On the other
+hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0
+and csrow1 will be populated. The pattern repeats itself for csrow2 and
+csrow3.
+
+The representation of the above is reflected in the directory
+tree in EDAC's sysfs interface. Starting in directory
+``/sys/devices/system/edac/mc``, each memory controller will be
+represented by its own ``mcX`` directory, where ``X`` is the
+index of the MC::
+
+ ..../edac/mc/
+ |
+ |->mc0
+ |->mc1
+ |->mc2
+ ....
+
+Under each ``mcX`` directory each ``csrowX`` is again represented by a
+``csrowX``, where ``X`` is the csrow index::
+
+ .../mc/mc0/
+ |
+ |->csrow0
+ |->csrow2
+ |->csrow3
+ ....
+
+Notice that there is no csrow1, which indicates that csrow0 is composed
+of a single ranked DIMMs. This should also apply in both Channels, in
+order to have dual-channel mode be operational. Since both csrow2 and
+csrow3 are populated, this indicates a dual ranked set of DIMMs for
+channels 0 and 1.
+
+Within each of the ``mcX`` and ``csrowX`` directories are several EDAC
+control and attribute files.
+
+``mcX`` directories
+-------------------
+
+In ``mcX`` directories are EDAC control and attribute files for
+this ``X`` instance of the memory controllers.
+
+For a description of the sysfs API, please see:
+
+ Documentation/ABI/testing/sysfs-devices-edac
+
+
+``dimmX`` or ``rankX`` directories
+----------------------------------
+
+The recommended way to use the EDAC subsystem is to look at the information
+provided by the ``dimmX`` or ``rankX`` directories [#f5]_.
+
+A typical EDAC system has the following structure under
+``/sys/devices/system/edac/``\ [#f6]_::
+
+ /sys/devices/system/edac/
+ ├── mc
+ │   ├── mc0
+ │   │   ├── ce_count
+ │   │   ├── ce_noinfo_count
+ │   │   ├── dimm0
+ │   │   │   ├── dimm_dev_type
+ │   │   │   ├── dimm_edac_mode
+ │   │   │   ├── dimm_label
+ │   │   │   ├── dimm_location
+ │   │   │   ├── dimm_mem_type
+ │   │   │   ├── size
+ │   │   │   └── uevent
+ │   │   ├── max_location
+ │   │   ├── mc_name
+ │   │   ├── reset_counters
+ │   │   ├── seconds_since_reset
+ │   │   ├── size_mb
+ │   │   ├── ue_count
+ │   │   ├── ue_noinfo_count
+ │   │   └── uevent
+ │   ├── mc1
+ │   │   ├── ce_count
+ │   │   ├── ce_noinfo_count
+ │   │   ├── dimm0
+ │   │   │   ├── dimm_dev_type
+ │   │   │   ├── dimm_edac_mode
+ │   │   │   ├── dimm_label
+ │   │   │   ├── dimm_location
+ │   │   │   ├── dimm_mem_type
+ │   │   │   ├── size
+ │   │   │   └── uevent
+ │   │   ├── max_location
+ │   │   ├── mc_name
+ │   │   ├── reset_counters
+ │   │   ├── seconds_since_reset
+ │   │   ├── size_mb
+ │   │   ├── ue_count
+ │   │   ├── ue_noinfo_count
+ │   │   └── uevent
+ │   └── uevent
+ └── uevent
+
+In the ``dimmX`` directories are EDAC control and attribute files for
+this ``X`` memory module:
+
+- ``size`` - Total memory managed by this csrow attribute file
+
+ This attribute file displays, in count of megabytes, the memory
+ that this csrow contains.
+
+- ``dimm_dev_type`` - Device type attribute file
+
+ This attribute file will display what type of DRAM device is
+ being utilized on this DIMM.
+ Examples:
+
+ - x1
+ - x2
+ - x4
+ - x8
+
+- ``dimm_edac_mode`` - EDAC Mode of operation attribute file
+
+ This attribute file will display what type of Error detection
+ and correction is being utilized.
+
+- ``dimm_label`` - memory module label control file
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+- ``dimm_location`` - location of the memory module
+
+ The location can have up to 3 levels, and describe how the
+ memory controller identifies the location of a memory module.
+ Depending on the type of memory and memory controller, it
+ can be:
+
+ - *csrow* and *channel* - used when the memory controller
+ doesn't identify a single DIMM - e. g. in ``rankX`` dir;
+ - *branch*, *channel*, *slot* - typically used on FB-DIMM memory
+ controllers;
+ - *channel*, *slot* - used on Nehalem and newer Intel drivers.
+
+- ``dimm_mem_type`` - Memory Type attribute file
+
+ This attribute file will display what type of memory is currently
+ on this csrow. Normally, either buffered or unbuffered memory.
+ Examples:
+
+ - Registered-DDR
+ - Unbuffered-DDR
+
+.. [#f5] On some systems, the memory controller doesn't have any logic
+ to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories.
+ On modern Intel memory controllers, the memory controller identifies the
+ memory modules directly. On such systems, the directory is called ``dimmX``.
+
+.. [#f6] There are also some ``power`` directories and ``subsystem``
+ symlinks inside the sysfs mapping that are automatically created by
+ the sysfs subsystem. Currently, they serve no purpose.
+
+``csrowX`` directories
+----------------------
+
+When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX``
+directories. As this API doesn't work properly for Rambus, FB-DIMMs and
+modern Intel Memory Controllers, this is being deprecated in favor of
+``dimmX`` directories.
+
+In the ``csrowX`` directories are EDAC control and attribute files for
+this ``X`` instance of csrow:
+
+
+- ``ue_count`` - Total Uncorrectable Errors count attribute file
+
+ This attribute file displays the total count of uncorrectable
+ errors that have occurred on this csrow. If panic_on_ue is set
+ this counter will not have a chance to increment, since EDAC
+ will panic the system.
+
+
+- ``ce_count`` - Total Correctable Errors count attribute file
+
+ This attribute file displays the total count of correctable
+ errors that have occurred on this csrow. This count is very
+ important to examine. CEs provide early indications that a
+ DIMM is beginning to fail. This count field should be
+ monitored for non-zero values and report such information
+ to the system administrator.
+
+
+- ``size_mb`` - Total memory managed by this csrow attribute file
+
+ This attribute file displays, in count of megabytes, the memory
+ that this csrow contains.
+
+
+- ``mem_type`` - Memory Type attribute file
+
+ This attribute file will display what type of memory is currently
+ on this csrow. Normally, either buffered or unbuffered memory.
+ Examples:
+
+ - Registered-DDR
+ - Unbuffered-DDR
+
+
+- ``edac_mode`` - EDAC Mode of operation attribute file
+
+ This attribute file will display what type of Error detection
+ and correction is being utilized.
+
+
+- ``dev_type`` - Device type attribute file
+
+ This attribute file will display what type of DRAM device is
+ being utilized on this DIMM.
+ Examples:
+
+ - x1
+ - x2
+ - x4
+ - x8
+
+
+- ``ch0_ce_count`` - Channel 0 CE Count attribute file
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 0.
+
+
+- ``ch0_ue_count`` - Channel 0 UE Count attribute file
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+- ``ch0_dimm_label`` - Channel 0 DIMM Label control file
+
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+
+- ``ch1_ce_count`` - Channel 1 CE Count attribute file
+
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 1.
+
+
+- ``ch1_ue_count`` - Channel 1 UE Count attribute file
+
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+- ``ch1_dimm_label`` - Channel 1 DIMM Label control file
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+
+System Logging
+--------------
+
+If logging for UEs and CEs is enabled, then system logs will contain
+information indicating that errors have been detected::
+
+ EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac
+ EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac
+
+
+The structure of the message is:
+
+ +---------------------------------------+-------------+
+ | Content + Example |
+ +=======================================+=============+
+ | The memory controller | MC0 |
+ +---------------------------------------+-------------+
+ | Error type | CE |
+ +---------------------------------------+-------------+
+ | Memory page | 0x283 |
+ +---------------------------------------+-------------+
+ | Offset in the page | 0xce0 |
+ +---------------------------------------+-------------+
+ | The byte granularity | grain 8 |
+ | or resolution of the error | |
+ +---------------------------------------+-------------+
+ | The error syndrome | 0xb741 |
+ +---------------------------------------+-------------+
+ | Memory row | row 0 +
+ +---------------------------------------+-------------+
+ | Memory channel | channel 1 |
+ +---------------------------------------+-------------+
+ | DIMM label, if set prior | DIMM B1 |
+ +---------------------------------------+-------------+
+ | And then an optional, driver-specific | |
+ | message that may have additional | |
+ | information. | |
+ +---------------------------------------+-------------+
+
+Both UEs and CEs with no info will lack all but memory controller, error
+type, a notice of "no info" and then an optional, driver-specific error
+message.
+
+
+PCI Bus Parity Detection
+------------------------
+
+On Header Type 00 devices, the primary status is looked at for any
+parity error regardless of whether parity is enabled on the device or
+not. (The spec indicates parity is generated in some cases). On Header
+Type 01 bridges, the secondary status register is also looked at to see
+if parity occurred on the bus on the other side of the bridge.
+
+
+Sysfs configuration
+-------------------
+
+Under ``/sys/devices/system/edac/pci`` are control and attribute files as
+follows:
+
+
+- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file
+
+ This control file enables or disables the PCI Bus Parity scanning
+ operation. Writing a 1 to this file enables the scanning. Writing
+ a 0 to this file disables the scanning.
+
+ Enable::
+
+ echo "1" >/sys/devices/system/edac/pci/check_pci_parity
+
+ Disable::
+
+ echo "0" >/sys/devices/system/edac/pci/check_pci_parity
+
+
+- ``pci_parity_count`` - Parity Count
+
+ This attribute file will display the number of parity errors that
+ have been detected.
+
+
+Module parameters
+-----------------
+
+- ``edac_mc_panic_on_ue`` - Panic on UE control file
+
+ An uncorrectable error will cause a machine panic. This is usually
+ desirable. It is a bad idea to continue when an uncorrectable error
+ occurs - it is indeterminate what was uncorrected and the operating
+ system context might be so mangled that continuing will lead to further
+ corruption. If the kernel has MCE configured, then EDAC will never
+ notice the UE.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_panic_on_ue=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue
+
+
+- ``edac_mc_log_ue`` - Log UE control file
+
+
+ Generate kernel messages describing uncorrectable errors. These errors
+ are reported through the system message log system. UE statistics
+ will be accumulated even when UE logging is disabled.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_log_ue=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue
+
+
+- ``edac_mc_log_ce`` - Log CE control file
+
+
+ Generate kernel messages describing correctable errors. These
+ errors are reported through the system message log system.
+ CE statistics will be accumulated even when CE logging is disabled.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_log_ce=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce
+
+
+- ``edac_mc_poll_msec`` - Polling period control file
+
+
+ The time period, in milliseconds, for polling for error information.
+ Too small a value wastes resources. Too large a value might delay
+ necessary handling of errors and might loose valuable information for
+ locating the error. 1000 milliseconds (once each second) is the current
+ default. Systems which require all the bandwidth they can get, may
+ increase this.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_poll_msec=[0|1]
+
+ RUN TIME::
+
+ echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec
+
+
+- ``panic_on_pci_parity`` - Panic on PCI PARITY Error
+
+
+ This control file enables or disables panicking when a parity
+ error has been detected.
+
+
+ module/kernel parameter::
+
+ edac_panic_on_pci_pe=[0|1]
+
+ Enable::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+ Disable::
+
+ echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+
+
+EDAC device type
+----------------
+
+In the header file, edac_pci.h, there is a series of edac_device structures
+and APIs for the EDAC_DEVICE.
+
+User space access to an edac_device is through the sysfs interface.
+
+At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices
+will appear.
+
+There is a three level tree beneath the above ``edac`` directory. For example,
+the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net
+website) installs itself as::
+
+ /sys/devices/system/edac/test-instance
+
+in this directory are various controls, a symlink and one or more ``instance``
+directories.
+
+The standard default controls are:
+
+ ============== =======================================================
+ log_ce boolean to log CE events
+ log_ue boolean to log UE events
+ panic_on_ue boolean to ``panic`` the system if an UE is encountered
+ (default off, can be set true via startup script)
+ poll_msec time period between POLL cycles for events
+ ============== =======================================================
+
+The test_device_edac device adds at least one of its own custom control:
+
+ ============== ==================================================
+ test_bits which in the current test driver does nothing but
+ show how it is installed. A ported driver can
+ add one or more such controls and/or attributes
+ for specific uses.
+ One out-of-tree driver uses controls here to allow
+ for ERROR INJECTION operations to hardware
+ injection registers
+ ============== ==================================================
+
+The symlink points to the 'struct dev' that is registered for this edac_device.
+
+Instances
+---------
+
+One or more instance directories are present. For the ``test_device_edac``
+case:
+
+ +----------------+
+ | test-instance0 |
+ +----------------+
+
+
+In this directory there are two default counter attributes, which are totals of
+counter in deeper subdirectories.
+
+ ============== ====================================
+ ce_count total of CE events of subdirectories
+ ue_count total of UE events of subdirectories
+ ============== ====================================
+
+Blocks
+------
+
+At the lowest directory level is the ``block`` directory. There can be 0, 1
+or more blocks specified in each instance:
+
+ +-------------+
+ | test-block0 |
+ +-------------+
+
+In this directory the default attributes are:
+
+ ============== ================================================
+ ce_count which is counter of CE events for this ``block``
+ of hardware being monitored
+ ue_count which is counter of UE events for this ``block``
+ of hardware being monitored
+ ============== ================================================
+
+
+The ``test_device_edac`` device adds 4 attributes and 1 control:
+
+ ================== ====================================================
+ test-block-bits-0 for every POLL cycle this counter
+ is incremented
+ test-block-bits-1 every 10 cycles, this counter is bumped once,
+ and test-block-bits-0 is set to 0
+ test-block-bits-2 every 100 cycles, this counter is bumped once,
+ and test-block-bits-1 is set to 0
+ test-block-bits-3 every 1000 cycles, this counter is bumped once,
+ and test-block-bits-2 is set to 0
+ ================== ====================================================
+
+
+ ================== ====================================================
+ reset-counters writing ANY thing to this control will
+ reset all the above counters.
+ ================== ====================================================
+
+
+Use of the ``test_device_edac`` driver should enable any others to create their own
+unique drivers for their hardware systems.
+
+The ``test_device_edac`` sample driver is located at the
+http://bluesmoke.sourceforge.net project site for EDAC.
+
+
+Usage of EDAC APIs on Nehalem and newer Intel CPUs
+--------------------------------------------------
+
+On older Intel architectures, the memory controller was part of the North
+Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and
+newer Intel architectures integrated an enhanced version of the memory
+controller (MC) inside the CPUs.
+
+This chapter will cover the differences of the enhanced memory controllers
+found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and
+``sbx_edac`` drivers.
+
+.. note::
+
+ The Xeon E7 processor families use a separate chip for the memory
+ controller, called Intel Scalable Memory Buffer. This section doesn't
+ apply for such families.
+
+1) There is one Memory Controller per Quick Patch Interconnect
+ (QPI). At the driver, the term "socket" means one QPI. This is
+ associated with a physical CPU socket.
+
+ Each MC have 3 physical read channels, 3 physical write channels and
+ 3 logic channels. The driver currently sees it as just 3 channels.
+ Each channel can have up to 3 DIMMs.
+
+ The minimum known unity is DIMMs. There are no information about csrows.
+ As EDAC API maps the minimum unity is csrows, the driver sequentially
+ maps channel/DIMM into different csrows.
+
+ For example, supposing the following layout::
+
+ Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+
+ The driver will map it as::
+
+ csrow0: channel 0, dimm0
+ csrow1: channel 0, dimm1
+ csrow2: channel 1, dimm0
+ csrow3: channel 2, dimm0
+
+ exports one DIMM per csrow.
+
+ Each QPI is exported as a different memory controller.
+
+2) The MC has the ability to inject errors to test drivers. The drivers
+ implement this functionality via some error injection nodes:
+
+ For injecting a memory error, there are some sysfs nodes, under
+ ``/sys/devices/system/edac/mc/mc?/``:
+
+ - ``inject_addrmatch/*``:
+ Controls the error injection mask register. It is possible to specify
+ several characteristics of the address to match an error code::
+
+ dimm = the affected dimm. Numbers are relative to a channel;
+ rank = the memory rank;
+ channel = the channel that will generate an error;
+ bank = the affected bank;
+ page = the page address;
+ column (or col) = the address column.
+
+ each of the above values can be set to "any" to match any valid value.
+
+ At driver init, all values are set to any.
+
+ For example, to generate an error at rank 1 of dimm 2, for any channel,
+ any bank, any page, any column::
+
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
+ echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
+
+ To return to the default behaviour of matching any, you can do::
+
+ echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
+ echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
+
+ - ``inject_eccmask``:
+ specifies what bits will have troubles,
+
+ - ``inject_section``:
+ specifies what ECC cache section will get the error::
+
+ 3 for both
+ 2 for the highest
+ 1 for the lowest
+
+ - ``inject_type``:
+ specifies the type of error, being a combination of the following bits::
+
+ bit 0 - repeat
+ bit 1 - ecc
+ bit 2 - parity
+
+ - ``inject_enable``:
+ starts the error generation when something different than 0 is written.
+
+ All inject vars can be read. root permission is needed for write.
+
+ Datasheet states that the error will only be generated after a write on an
+ address that matches inject_addrmatch. It seems, however, that reading will
+ also produce an error.
+
+ For example, the following code will generate an error for any write access
+ at socket 0, on any DIMM/address on channel 2::
+
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_type
+ echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask
+ echo 3 >/sys/devices/system/edac/mc/mc0/inject_section
+ echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable
+ dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null
+
+ For socket 1, it is needed to replace "mc0" by "mc1" at the above
+ commands.
+
+ The generated error message will look like::
+
+ EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error))
+
+3) Corrected Error memory register counters
+
+ Those newer MCs have some registers to count memory errors. The driver
+ uses those registers to report Corrected Errors on devices with Registered
+ DIMMs.
+
+ However, those counters don't work with Unregistered DIMM. As the chipset
+ offers some counters that also work with UDIMMs (but with a worse level of
+ granularity than the default ones), the driver exposes those registers for
+ UDIMM memories.
+
+ They can be read by looking at the contents of ``all_channel_counts/``::
+
+ $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0
+ 0
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1
+ 0
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2
+ 0
+
+ What happens here is that errors on different csrows, but at the same
+ dimm number will increment the same counter.
+ So, in this memory mapping::
+
+ csrow0: channel 0, dimm0
+ csrow1: channel 0, dimm1
+ csrow2: channel 1, dimm0
+ csrow3: channel 2, dimm0
+
+ The hardware will increment udimm0 for an error at the first dimm at either
+ csrow0, csrow2 or csrow3;
+
+ The hardware will increment udimm1 for an error at the second dimm at either
+ csrow0, csrow2 or csrow3;
+
+ The hardware will increment udimm2 for an error at the third dimm at either
+ csrow0, csrow2 or csrow3;
+
+4) Standard error counters
+
+ The standard error counters are generated when an mcelog error is received
+ by the driver. Since, with UDIMM, this is counted by software, it is
+ possible that some errors could be lost. With RDIMM's, they display the
+ contents of the registers
+
+Reference documents used on ``amd64_edac``
+------------------------------------------
+
+``amd64_edac`` module is based on the following documents
+(available from http://support.amd.com/en-us/search/tech-docs):
+
+1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
+ Opteron Processors
+ :AMD publication #: 26094
+ :Revision: 3.26
+ :Link: http://support.amd.com/TechDocs/26094.PDF
+
+2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
+ Processors
+ :AMD publication #: 32559
+ :Revision: 3.00
+ :Issue Date: May 2006
+ :Link: http://support.amd.com/TechDocs/32559.pdf
+
+3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
+ Processors
+ :AMD publication #: 31116
+ :Revision: 3.00
+ :Issue Date: September 07, 2007
+ :Link: http://support.amd.com/TechDocs/31116.pdf
+
+4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
+ Models 30h-3Fh Processors
+ :AMD publication #: 49125
+ :Revision: 3.06
+ :Issue Date: 2/12/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf
+
+5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
+ Models 60h-6Fh Processors
+ :AMD publication #: 50742
+ :Revision: 3.01
+ :Issue Date: 7/23/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf
+
+6. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h
+ Models 00h-0Fh Processors
+ :AMD publication #: 48751
+ :Revision: 3.03
+ :Issue Date: 2/23/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf
+
+Credits
+=======
+
+* Written by Doug Thompson <dougthompson@xmission.com>
+
+ - 7 Dec 2005
+ - 17 Jul 2007 Updated
+
+* |copy| Mauro Carvalho Chehab
+
+ - 05 Aug 2009 Nehalem interface
+ - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section
+
+* EDAC authors/maintainers:
+
+ - Doug Thompson, Dave Jiang, Dave Peterson et al,
+ - Mauro Carvalho Chehab
+ - Borislav Petkov
+ - original author: Thayne Harbaugh
diff --git a/Documentation/driver-api/edac.rst b/Documentation/driver-api/edac.rst
new file mode 100644
index 000000000000..b8c742aa0a71
--- /dev/null
+++ b/Documentation/driver-api/edac.rst
@@ -0,0 +1,178 @@
+Error Detection And Correction (EDAC) Devices
+=============================================
+
+Main Concepts used at the EDAC subsystem
+----------------------------------------
+
+There are several things to be aware of that aren't at all obvious, like
+*sockets, *socket sets*, *banks*, *rows*, *chip-select rows*, *channels*,
+etc...
+
+These are some of the many terms that are thrown about that don't always
+mean what people think they mean (Inconceivable!). In the interest of
+creating a common ground for discussion, terms and their definitions
+will be established.
+
+* Memory devices
+
+The individual DRAM chips on a memory stick. These devices commonly
+output 4 and 8 bits each (x4, x8). Grouping several of these in parallel
+provides the number of bits that the memory controller expects:
+typically 72 bits, in order to provide 64 bits + 8 bits of ECC data.
+
+* Memory Stick
+
+A printed circuit board that aggregates multiple memory devices in
+parallel. In general, this is the Field Replaceable Unit (FRU) which
+gets replaced, in the case of excessive errors. Most often it is also
+called DIMM (Dual Inline Memory Module).
+
+* Memory Socket
+
+A physical connector on the motherboard that accepts a single memory
+stick. Also called as "slot" on several datasheets.
+
+* Channel
+
+A memory controller channel, responsible to communicate with a group of
+DIMMs. Each channel has its own independent control (command) and data
+bus, and can be used independently or grouped with other channels.
+
+* Branch
+
+It is typically the highest hierarchy on a Fully-Buffered DIMM memory
+controller. Typically, it contains two channels. Two channels at the
+same branch can be used in single mode or in lockstep mode. When
+lockstep is enabled, the cacheline is doubled, but it generally brings
+some performance penalty. Also, it is generally not possible to point to
+just one memory stick when an error occurs, as the error correction code
+is calculated using two DIMMs instead of one. Due to that, it is capable
+of correcting more errors than on single mode.
+
+* Single-channel
+
+The data accessed by the memory controller is contained into one dimm
+only. E. g. if the data is 64 bits-wide, the data flows to the CPU using
+one 64 bits parallel access. Typically used with SDR, DDR, DDR2 and DDR3
+memories. FB-DIMM and RAMBUS use a different concept for channel, so
+this concept doesn't apply there.
+
+* Double-channel
+
+The data size accessed by the memory controller is interlaced into two
+dimms, accessed at the same time. E. g. if the DIMM is 64 bits-wide (72
+bits with ECC), the data flows to the CPU using a 128 bits parallel
+access.
+
+* Chip-select row
+
+This is the name of the DRAM signal used to select the DRAM ranks to be
+accessed. Common chip-select rows for single channel are 64 bits, for
+dual channel 128 bits. It may not be visible by the memory controller,
+as some DIMM types have a memory buffer that can hide direct access to
+it from the Memory Controller.
+
+* Single-Ranked stick
+
+A Single-ranked stick has 1 chip-select row of memory. Motherboards
+commonly drive two chip-select pins to a memory stick. A single-ranked
+stick, will occupy only one of those rows. The other will be unused.
+
+.. _doubleranked:
+
+* Double-Ranked stick
+
+A double-ranked stick has two chip-select rows which access different
+sets of memory devices. The two rows cannot be accessed concurrently.
+
+* Double-sided stick
+
+**DEPRECATED TERM**, see :ref:`Double-Ranked stick <doubleranked>`.
+
+A double-sided stick has two chip-select rows which access different sets
+of memory devices. The two rows cannot be accessed concurrently.
+"Double-sided" is irrespective of the memory devices being mounted on
+both sides of the memory stick.
+
+* Socket set
+
+All of the memory sticks that are required for a single memory access or
+all of the memory sticks spanned by a chip-select row. A single socket
+set has two chip-select rows and if double-sided sticks are used these
+will occupy those chip-select rows.
+
+* Bank
+
+This term is avoided because it is unclear when needing to distinguish
+between chip-select rows and socket sets.
+
+
+Memory Controllers
+------------------
+
+Most of the EDAC core is focused on doing Memory Controller error detection.
+The :c:func:`edac_mc_alloc`. It uses internally the struct ``mem_ctl_info``
+to describe the memory controllers, with is an opaque struct for the EDAC
+drivers. Only the EDAC core is allowed to touch it.
+
+.. kernel-doc:: include/linux/edac.h
+
+.. kernel-doc:: drivers/edac/edac_mc.h
+
+PCI Controllers
+---------------
+
+The EDAC subsystem provides a mechanism to handle PCI controllers by calling
+the :c:func:`edac_pci_alloc_ctl_info`. It will use the struct
+:c:type:`edac_pci_ctl_info` to describe the PCI controllers.
+
+.. kernel-doc:: drivers/edac/edac_pci.h
+
+EDAC Blocks
+-----------
+
+The EDAC subsystem also provides a generic mechanism to report errors on
+other parts of the hardware via :c:func:`edac_device_alloc_ctl_info` function.
+
+The structures :c:type:`edac_dev_sysfs_block_attribute`,
+:c:type:`edac_device_block`, :c:type:`edac_device_instance` and
+:c:type:`edac_device_ctl_info` provide a generic or abstract 'edac_device'
+representation at sysfs.
+
+This set of structures and the code that implements the APIs for the same, provide for registering EDAC type devices which are NOT standard memory or
+PCI, like:
+
+- CPU caches (L1 and L2)
+- DMA engines
+- Core CPU switches
+- Fabric switch units
+- PCIe interface controllers
+- other EDAC/ECC type devices that can be monitored for
+ errors, etc.
+
+It allows for a 2 level set of hierarchy.
+
+For example, a cache could be composed of L1, L2 and L3 levels of cache.
+Each CPU core would have its own L1 cache, while sharing L2 and maybe L3
+caches. On such case, those can be represented via the following sysfs
+nodes::
+
+ /sys/devices/system/edac/..
+
+ pci/ <existing pci directory (if available)>
+ mc/ <existing memory device directory>
+ cpu/cpu0/.. <L1 and L2 block directory>
+ /L1-cache/ce_count
+ /ue_count
+ /L2-cache/ce_count
+ /ue_count
+ cpu/cpu1/.. <L1 and L2 block directory>
+ /L1-cache/ce_count
+ /ue_count
+ /L2-cache/ce_count
+ /ue_count
+ ...
+
+ the L1 and L2 directories would be "edac_device_block's"
+
+.. kernel-doc:: drivers/edac/edac_device.h
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index a528178a54a5..5475a2807e7a 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -26,6 +26,7 @@ available subsections can be seen below.
spi
i2c
hsi
+ edac
miscellaneous
vme
80211/index
diff --git a/Documentation/edac.txt b/Documentation/edac.txt
deleted file mode 100644
index f89cfd85ae13..000000000000
--- a/Documentation/edac.txt
+++ /dev/null
@@ -1,812 +0,0 @@
-EDAC - Error Detection And Correction
-=====================================
-
-"bluesmoke" was the name for this device driver when it
-was "out-of-tree" and maintained at sourceforge.net -
-bluesmoke.sourceforge.net. That site is mostly archaic now and can be
-used only for historical purposes.
-
-When the subsystem was pushed into 2.6.16 for the first time, it was
-renamed to 'EDAC'.
-
-PURPOSE
--------
-
-The 'edac' kernel module's goal is to detect and report hardware errors
-that occur within the computer system running under linux.
-
-MEMORY
-------
-
-Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the
-primary errors being harvested. These types of errors are harvested by
-the 'edac_mc' device.
-
-Detecting CE events, then harvesting those events and reporting them,
-*can* but must not necessarily be a predictor of future UE events. With
-CE events only, the system can and will continue to operate as no data
-has been damaged yet.
-
-However, preventive maintenance and proactive part replacement of memory
-DIMMs exhibiting CEs can reduce the likelihood of the dreaded UE events
-and system panics.
-
-OTHER HARDWARE ELEMENTS
------------------------
-
-A new feature for EDAC, the edac_device class of device, was added in
-the 2.6.23 version of the kernel.
-
-This new device type allows for non-memory type of ECC hardware detectors
-to have their states harvested and presented to userspace via the sysfs
-interface.
-
-Some architectures have ECC detectors for L1, L2 and L3 caches,
-along with DMA engines, fabric switches, main data path switches,
-interconnections, and various other hardware data paths. If the hardware
-reports it, then a edac_device device probably can be constructed to
-harvest and present that to userspace.
-
-
-PCI BUS SCANNING
-----------------
-
-In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors
-in order to determine if errors are occurring during data transfers.
-
-The presence of PCI Parity errors must be examined with a grain of salt.
-There are several add-in adapters that do *not* follow the PCI specification
-with regards to Parity generation and reporting. The specification says
-the vendor should tie the parity status bits to 0 if they do not intend
-to generate parity. Some vendors do not do this, and thus the parity bit
-can "float" giving false positives.
-
-There is a PCI device attribute located in sysfs that is checked by
-the EDAC PCI scanning code. If that attribute is set, PCI parity/error
-scanning is skipped for that device. The attribute is:
-
- broken_parity_status
-
-and is located in /sys/devices/pci<XXX>/0000:XX:YY.Z directories for
-PCI devices.
-
-
-VERSIONING
-----------
-
-EDAC is composed of a "core" module (edac_core.ko) and several Memory
-Controller (MC) driver modules. On a given system, the CORE is loaded
-and one MC driver will be loaded. Both the CORE and the MC driver (or
-edac_device driver) have individual versions that reflect current
-release level of their respective modules.
-
-Thus, to "report" on what version a system is running, one must report
-both the CORE's and the MC driver's versions.
-
-
-LOADING
--------
-
-If 'edac' was statically linked with the kernel then no loading
-is necessary. If 'edac' was built as modules then simply modprobe
-the 'edac' pieces that you need. You should be able to modprobe
-hardware-specific modules and have the dependencies load the necessary
-core modules.
-
-Example:
-
-$> modprobe amd76x_edac
-
-loads both the amd76x_edac.ko memory controller module and the edac_mc.ko
-core module.
-
-
-SYSFS INTERFACE
----------------
-
-EDAC presents a 'sysfs' interface for control and reporting purposes. It
-lives in the /sys/devices/system/edac directory.
-
-Within this directory there currently reside 2 components:
-
- mc memory controller(s) system
- pci PCI control and status system
-
-
-
-Memory Controller (mc) Model
-----------------------------
-
-Each 'mc' device controls a set of DIMM memory modules. These modules
-are laid out in a Chip-Select Row (csrowX) and Channel table (chX).
-There can be multiple csrows and multiple channels.
-
-Memory controllers allow for several csrows, with 8 csrows being a
-typical value. Yet, the actual number of csrows depends on the layout of
-a given motherboard, memory controller and DIMM characteristics.
-
-Dual channels allows for 128 bit data transfers to/from the CPU from/to
-memory. Some newer chipsets allow for more than 2 channels, like Fully
-Buffered DIMMs (FB-DIMMs). The following example will assume 2 channels:
-
-
- Channel 0 Channel 1
- ===================================
- csrow0 | DIMM_A0 | DIMM_B0 |
- csrow1 | DIMM_A0 | DIMM_B0 |
- ===================================
-
- ===================================
- csrow2 | DIMM_A1 | DIMM_B1 |
- csrow3 | DIMM_A1 | DIMM_B1 |
- ===================================
-
-In the above example table there are 4 physical slots on the motherboard
-for memory DIMMs:
-
- DIMM_A0
- DIMM_B0
- DIMM_A1
- DIMM_B1
-
-Labels for these slots are usually silk-screened on the motherboard.
-Slots labeled 'A' are channel 0 in this example. Slots labeled 'B' are
-channel 1. Notice that there are two csrows possible on a physical DIMM.
-These csrows are allocated their csrow assignment based on the slot into
-which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
-Channel, the csrows cross both DIMMs.
-
-Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
-Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
-will have 1 csrow, csrow0. csrow1 will be empty. On the other hand,
-when 2 dual ranked DIMMs are similarly placed, then both csrow0 and
-csrow1 will be populated. The pattern repeats itself for csrow2 and
-csrow3.
-
-The representation of the above is reflected in the directory
-tree in EDAC's sysfs interface. Starting in directory
-/sys/devices/system/edac/mc each memory controller will be represented
-by its own 'mcX' directory, where 'X' is the index of the MC.
-
-
- ..../edac/mc/
- |
- |->mc0
- |->mc1
- |->mc2
- ....
-
-Under each 'mcX' directory each 'csrowX' is again represented by a
-'csrowX', where 'X' is the csrow index:
-
-
- .../mc/mc0/
- |
- |->csrow0
- |->csrow2
- |->csrow3
- ....
-
-Notice that there is no csrow1, which indicates that csrow0 is composed
-of a single ranked DIMMs. This should also apply in both Channels, in
-order to have dual-channel mode be operational. Since both csrow2 and
-csrow3 are populated, this indicates a dual ranked set of DIMMs for
-channels 0 and 1.
-
-
-Within each of the 'mcX' and 'csrowX' directories are several EDAC
-control and attribute files.
-
-
-'mcX' directories
------------------
-
-In 'mcX' directories are EDAC control and attribute files for
-this 'X' instance of the memory controllers.
-
-For a description of the sysfs API, please see:
- Documentation/ABI/testing/sysfs-devices-edac
-
-
-
-'csrowX' directories
---------------------
-
-When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the csrowX
-directories. As this API doesn't work properly for Rambus, FB-DIMMs and
-modern Intel Memory Controllers, this is being deprecated in favor of
-dimmX directories.
-
-In the 'csrowX' directories are EDAC control and attribute files for
-this 'X' instance of csrow:
-
-
-Total Uncorrectable Errors count attribute file:
-
- 'ue_count'
-
- This attribute file displays the total count of uncorrectable
- errors that have occurred on this csrow. If panic_on_ue is set
- this counter will not have a chance to increment, since EDAC
- will panic the system.
-
-
-Total Correctable Errors count attribute file:
-
- 'ce_count'
-
- This attribute file displays the total count of correctable
- errors that have occurred on this csrow. This count is very
- important to examine. CEs provide early indications that a
- DIMM is beginning to fail. This count field should be
- monitored for non-zero values and report such information
- to the system administrator.
-
-
-Total memory managed by this csrow attribute file:
-
- 'size_mb'
-
- This attribute file displays, in count of megabytes, the memory
- that this csrow contains.
-
-
-Memory Type attribute file:
-
- 'mem_type'
-
- This attribute file will display what type of memory is currently
- on this csrow. Normally, either buffered or unbuffered memory.
- Examples:
- Registered-DDR
- Unbuffered-DDR
-
-
-EDAC Mode of operation attribute file:
-
- 'edac_mode'
-
- This attribute file will display what type of Error detection
- and correction is being utilized.
-
-
-Device type attribute file:
-
- 'dev_type'
-
- This attribute file will display what type of DRAM device is
- being utilized on this DIMM.
- Examples:
- x1
- x2
- x4
- x8
-
-
-Channel 0 CE Count attribute file:
-
- 'ch0_ce_count'
-
- This attribute file will display the count of CEs on this
- DIMM located in channel 0.
-
-
-Channel 0 UE Count attribute file:
-
- 'ch0_ue_count'
-
- This attribute file will display the count of UEs on this
- DIMM located in channel 0.
-
-
-Channel 0 DIMM Label control file:
-
- 'ch0_dimm_label'
-
- This control file allows this DIMM to have a label assigned
- to it. With this label in the module, when errors occur
- the output can provide the DIMM label in the system log.
- This becomes vital for panic events to isolate the
- cause of the UE event.
-
- DIMM Labels must be assigned after booting, with information
- that correctly identifies the physical slot with its
- silk screen label. This information is currently very
- motherboard specific and determination of this information
- must occur in userland at this time.
-
-
-Channel 1 CE Count attribute file:
-
- 'ch1_ce_count'
-
- This attribute file will display the count of CEs on this
- DIMM located in channel 1.
-
-
-Channel 1 UE Count attribute file:
-
- 'ch1_ue_count'
-
- This attribute file will display the count of UEs on this
- DIMM located in channel 0.
-
-
-Channel 1 DIMM Label control file:
-
- 'ch1_dimm_label'
-
- This control file allows this DIMM to have a label assigned
- to it. With this label in the module, when errors occur
- the output can provide the DIMM label in the system log.
- This becomes vital for panic events to isolate the
- cause of the UE event.
-
- DIMM Labels must be assigned after booting, with information
- that correctly identifies the physical slot with its
- silk screen label. This information is currently very
- motherboard specific and determination of this information
- must occur in userland at this time.
-
-
-
-SYSTEM LOGGING
---------------
-
-If logging for UEs and CEs is enabled, then system logs will contain
-information indicating that errors have been detected:
-
-EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0,
-channel 1 "DIMM_B1": amd76x_edac
-
-EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0,
-channel 1 "DIMM_B1": amd76x_edac
-
-
-The structure of the message is:
- the memory controller (MC0)
- Error type (CE)
- memory page (0x283)
- offset in the page (0xce0)
- the byte granularity (grain 8)
- or resolution of the error
- the error syndrome (0xb741)
- memory row (row 0)
- memory channel (channel 1)
- DIMM label, if set prior (DIMM B1
- and then an optional, driver-specific message that may
- have additional information.
-
-Both UEs and CEs with no info will lack all but memory controller, error
-type, a notice of "no info" and then an optional, driver-specific error
-message.
-
-
-PCI Bus Parity Detection
-------------------------
-
-On Header Type 00 devices, the primary status is looked at for any
-parity error regardless of whether parity is enabled on the device or
-not. (The spec indicates parity is generated in some cases). On Header
-Type 01 bridges, the secondary status register is also looked at to see
-if parity occurred on the bus on the other side of the bridge.
-
-
-SYSFS CONFIGURATION
--------------------
-
-Under /sys/devices/system/edac/pci are control and attribute files as follows:
-
-
-Enable/Disable PCI Parity checking control file:
-
- 'check_pci_parity'
-
-
- This control file enables or disables the PCI Bus Parity scanning
- operation. Writing a 1 to this file enables the scanning. Writing
- a 0 to this file disables the scanning.
-
- Enable:
- echo "1" >/sys/devices/system/edac/pci/check_pci_parity
-
- Disable:
- echo "0" >/sys/devices/system/edac/pci/check_pci_parity
-
-
-Parity Count:
-
- 'pci_parity_count'
-
- This attribute file will display the number of parity errors that
- have been detected.
-
-
-
-MODULE PARAMETERS
------------------
-
-Panic on UE control file:
-
- 'edac_mc_panic_on_ue'
-
- An uncorrectable error will cause a machine panic. This is usually
- desirable. It is a bad idea to continue when an uncorrectable error
- occurs - it is indeterminate what was uncorrected and the operating
- system context might be so mangled that continuing will lead to further
- corruption. If the kernel has MCE configured, then EDAC will never
- notice the UE.
-
- LOAD TIME: module/kernel parameter: edac_mc_panic_on_ue=[0|1]
-
- RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue
-
-
-Log UE control file:
-
- 'edac_mc_log_ue'
-
- Generate kernel messages describing uncorrectable errors. These errors
- are reported through the system message log system. UE statistics
- will be accumulated even when UE logging is disabled.
-
- LOAD TIME: module/kernel parameter: edac_mc_log_ue=[0|1]
-
- RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue
-
-
-Log CE control file:
-
- 'edac_mc_log_ce'
-
- Generate kernel messages describing correctable errors. These
- errors are reported through the system message log system.
- CE statistics will be accumulated even when CE logging is disabled.
-
- LOAD TIME: module/kernel parameter: edac_mc_log_ce=[0|1]
-
- RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce
-
-
-Polling period control file:
-
- 'edac_mc_poll_msec'
-
- The time period, in milliseconds, for polling for error information.
- Too small a value wastes resources. Too large a value might delay
- necessary handling of errors and might loose valuable information for
- locating the error. 1000 milliseconds (once each second) is the current
- default. Systems which require all the bandwidth they can get, may
- increase this.
-
- LOAD TIME: module/kernel parameter: edac_mc_poll_msec=[0|1]
-
- RUN TIME: echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec
-
-
-Panic on PCI PARITY Error:
-
- 'panic_on_pci_parity'
-
-
- This control file enables or disables panicking when a parity
- error has been detected.
-
-
- module/kernel parameter: edac_panic_on_pci_pe=[0|1]
-
- Enable:
- echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
-
- Disable:
- echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
-
-
-
-EDAC device type
-----------------
-
-In the header file, edac_core.h, there is a series of edac_device structures
-and APIs for the EDAC_DEVICE.
-
-User space access to an edac_device is through the sysfs interface.
-
-At the location /sys/devices/system/edac (sysfs) new edac_device devices will
-appear.
-
-There is a three level tree beneath the above 'edac' directory. For example,
-the 'test_device_edac' device (found at the bluesmoke.sourceforget.net website)
-installs itself as:
-
- /sys/devices/systm/edac/test-instance
-
-in this directory are various controls, a symlink and one or more 'instance'
-directories.
-
-The standard default controls are:
-
- log_ce boolean to log CE events
- log_ue boolean to log UE events
- panic_on_ue boolean to 'panic' the system if an UE is encountered
- (default off, can be set true via startup script)
- poll_msec time period between POLL cycles for events
-
-The test_device_edac device adds at least one of its own custom control:
-
- test_bits which in the current test driver does nothing but
- show how it is installed. A ported driver can
- add one or more such controls and/or attributes
- for specific uses.
- One out-of-tree driver uses controls here to allow
- for ERROR INJECTION operations to hardware
- injection registers
-
-The symlink points to the 'struct dev' that is registered for this edac_device.
-
-INSTANCES
----------
-
-One or more instance directories are present. For the 'test_device_edac' case:
-
- test-instance0
-
-
-In this directory there are two default counter attributes, which are totals of
-counter in deeper subdirectories.
-
- ce_count total of CE events of subdirectories
- ue_count total of UE events of subdirectories
-
-BLOCKS
-------
-
-At the lowest directory level is the 'block' directory. There can be 0, 1
-or more blocks specified in each instance.
-
- test-block0
-
-
-In this directory the default attributes are:
-
- ce_count which is counter of CE events for this 'block'
- of hardware being monitored
- ue_count which is counter of UE events for this 'block'
- of hardware being monitored
-
-
-The 'test_device_edac' device adds 4 attributes and 1 control:
-
- test-block-bits-0 for every POLL cycle this counter
- is incremented
- test-block-bits-1 every 10 cycles, this counter is bumped once,
- and test-block-bits-0 is set to 0
- test-block-bits-2 every 100 cycles, this counter is bumped once,
- and test-block-bits-1 is set to 0
- test-block-bits-3 every 1000 cycles, this counter is bumped once,
- and test-block-bits-2 is set to 0
-
-
- reset-counters writing ANY thing to this control will
- reset all the above counters.
-
-
-Use of the 'test_device_edac' driver should enable any others to create their own
-unique drivers for their hardware systems.
-
-The 'test_device_edac' sample driver is located at the
-bluesmoke.sourceforge.net project site for EDAC.
-
-
-NEHALEM USAGE OF EDAC APIs
---------------------------
-
-This chapter documents some EXPERIMENTAL mappings for EDAC API to handle
-Nehalem EDAC driver. They will likely be changed on future versions
-of the driver.
-
-Due to the way Nehalem exports Memory Controller data, some adjustments
-were done at i7core_edac driver. This chapter will cover those differences
-
-1) On Nehalem, there is one Memory Controller per Quick Patch Interconnect
- (QPI). At the driver, the term "socket" means one QPI. This is
- associated with a physical CPU socket.
-
- Each MC have 3 physical read channels, 3 physical write channels and
- 3 logic channels. The driver currently sees it as just 3 channels.
- Each channel can have up to 3 DIMMs.
-
- The minimum known unity is DIMMs. There are no information about csrows.
- As EDAC API maps the minimum unity is csrows, the driver sequentially
- maps channel/dimm into different csrows.
-
- For example, supposing the following layout:
- Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
- dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400
- Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
- Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
- The driver will map it as:
- csrow0: channel 0, dimm0
- csrow1: channel 0, dimm1
- csrow2: channel 1, dimm0
- csrow3: channel 2, dimm0
-
-exports one
- DIMM per csrow.
-
- Each QPI is exported as a different memory controller.
-
-2) Nehalem MC has the ability to generate errors. The driver implements this
- functionality via some error injection nodes:
-
- For injecting a memory error, there are some sysfs nodes, under
- /sys/devices/system/edac/mc/mc?/:
-
- inject_addrmatch/*:
- Controls the error injection mask register. It is possible to specify
- several characteristics of the address to match an error code:
- dimm = the affected dimm. Numbers are relative to a channel;
- rank = the memory rank;
- channel = the channel that will generate an error;
- bank = the affected bank;
- page = the page address;
- column (or col) = the address column.
- each of the above values can be set to "any" to match any valid value.
-
- At driver init, all values are set to any.
-
- For example, to generate an error at rank 1 of dimm 2, for any channel,
- any bank, any page, any column:
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
- echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
-
- To return to the default behaviour of matching any, you can do:
- echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
- echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
-
- inject_eccmask:
- specifies what bits will have troubles,
-
- inject_section:
- specifies what ECC cache section will get the error:
- 3 for both
- 2 for the highest
- 1 for the lowest
-
- inject_type:
- specifies the type of error, being a combination of the following bits:
- bit 0 - repeat
- bit 1 - ecc
- bit 2 - parity
-
- inject_enable starts the error generation when something different
- than 0 is written.
-
- All inject vars can be read. root permission is needed for write.
-
- Datasheet states that the error will only be generated after a write on an
- address that matches inject_addrmatch. It seems, however, that reading will
- also produce an error.
-
- For example, the following code will generate an error for any write access
- at socket 0, on any DIMM/address on channel 2:
-
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_type
- echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask
- echo 3 >/sys/devices/system/edac/mc/mc0/inject_section
- echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable
- dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null
-
- For socket 1, it is needed to replace "mc0" by "mc1" at the above
- commands.
-
- The generated error message will look like:
-
- EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error))
-
-3) Nehalem specific Corrected Error memory counters
-
- Nehalem have some registers to count memory errors. The driver uses those
- registers to report Corrected Errors on devices with Registered Dimms.
-
- However, those counters don't work with Unregistered Dimms. As the chipset
- offers some counters that also work with UDIMMS (but with a worse level of
- granularity than the default ones), the driver exposes those registers for
- UDIMM memories.
-
- They can be read by looking at the contents of all_channel_counts/
-
- $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0
- 0
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1
- 0
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2
- 0
-
- What happens here is that errors on different csrows, but at the same
- dimm number will increment the same counter.
- So, in this memory mapping:
- csrow0: channel 0, dimm0
- csrow1: channel 0, dimm1
- csrow2: channel 1, dimm0
- csrow3: channel 2, dimm0
- The hardware will increment udimm0 for an error at the first dimm at either
- csrow0, csrow2 or csrow3;
- The hardware will increment udimm1 for an error at the second dimm at either
- csrow0, csrow2 or csrow3;
- The hardware will increment udimm2 for an error at the third dimm at either
- csrow0, csrow2 or csrow3;
-
-4) Standard error counters
-
- The standard error counters are generated when an mcelog error is received
- by the driver. Since, with udimm, this is counted by software, it is
- possible that some errors could be lost. With rdimm's, they display the
- contents of the registers
-
-AMD64_EDAC REFERENCE DOCUMENTS USED
------------------------------------
-amd64_edac module is based on the following documents
-(available from http://support.amd.com/en-us/search/tech-docs):
-
-1. Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
- Opteron Processors
- AMD publication #: 26094
- Revision: 3.26
- Link: http://support.amd.com/TechDocs/26094.PDF
-
-2. Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
- Processors
- AMD publication #: 32559
- Revision: 3.00
- Issue Date: May 2006
- Link: http://support.amd.com/TechDocs/32559.pdf
-
-3. Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
- Processors
- AMD publication #: 31116
- Revision: 3.00
- Issue Date: September 07, 2007
- Link: http://support.amd.com/TechDocs/31116.pdf
-
-4. Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
- Models 30h-3Fh Processors
- AMD publication #: 49125
- Revision: 3.06
- Issue Date: 2/12/2015 (latest release)
- Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf
-
-5. Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
- Models 60h-6Fh Processors
- AMD publication #: 50742
- Revision: 3.01
- Issue Date: 7/23/2015 (latest release)
- Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf
-
-6. Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h
- Models 00h-0Fh Processors
- AMD publication #: 48751
- Revision: 3.03
- Issue Date: 2/23/2015 (latest release)
- Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf
-
-CREDITS:
-========
-
-Written by Doug Thompson <dougthompson@xmission.com>
-7 Dec 2005
-17 Jul 2007 Updated
-
-(c) Mauro Carvalho Chehab
-05 Aug 2009 Nehalem interface
-
-EDAC authors/maintainers:
-
- Doug Thompson, Dave Jiang, Dave Peterson et al,
- Mauro Carvalho Chehab
- Borislav Petkov
- original author: Thayne Harbaugh
diff --git a/MAINTAINERS b/MAINTAINERS
index 299ee500f8fd..3061fa6af855 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4588,7 +4588,8 @@ L: linux-edac@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git for-next
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git linux_next
S: Supported
-F: Documentation/edac.txt
+F: Documentation/admin-guide/ras.rst
+F: Documentation/driver-api/edac.rst
F: drivers/edac/
F: include/linux/edac.h
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 6421cc3c7dc1..c5a5b91f37f0 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -35,7 +35,6 @@
#include <linux/uaccess.h>
#include "altera_edac.h"
-#include "edac_core.h"
#include "edac_module.h"
#define EDAC_MOD_STR "altera_edac"
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index f14c24d5b140..496603d8f3d2 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -17,7 +17,7 @@
#include <linux/mmzone.h>
#include <linux/edac.h>
#include <asm/msr.h>
-#include "edac_core.h"
+#include "edac_module.h"
#include "mce_amd.h"
#define amd64_debug(fmt, arg...) \
diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c
index 3a501b530e11..a7450275ad28 100644
--- a/drivers/edac/amd76x_edac.c
+++ b/drivers/edac/amd76x_edac.c
@@ -17,7 +17,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define AMD76X_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "amd76x_edac"
diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
index 2b63f7c2d6d2..b5786cfded3a 100644
--- a/drivers/edac/amd8111_edac.c
+++ b/drivers/edac/amd8111_edac.c
@@ -29,7 +29,6 @@
#include <linux/pci_ids.h>
#include <asm/io.h>
-#include "edac_core.h"
#include "edac_module.h"
#include "amd8111_edac.h"
diff --git a/drivers/edac/amd8131_edac.c b/drivers/edac/amd8131_edac.c
index a5c680561c73..8851c33d7d24 100644
--- a/drivers/edac/amd8131_edac.c
+++ b/drivers/edac/amd8131_edac.c
@@ -29,7 +29,6 @@
#include <linux/edac.h>
#include <linux/pci_ids.h>
-#include "edac_core.h"
#include "edac_module.h"
#include "amd8131_edac.h"
diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c
index a9259b069dcd..bc1f3416400e 100644
--- a/drivers/edac/cell_edac.c
+++ b/drivers/edac/cell_edac.c
@@ -19,7 +19,7 @@
#include <asm/machdep.h>
#include <asm/cell-regs.h>
-#include "edac_core.h"
+#include "edac_module.h"
struct cell_edac_priv
{
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
index 682288ced4ac..837b62c4993d 100644
--- a/drivers/edac/cpc925_edac.c
+++ b/drivers/edac/cpc925_edac.c
@@ -27,7 +27,6 @@
#include <linux/platform_device.h>
#include <linux/gfp.h>
-#include "edac_core.h"
#include "edac_module.h"
#define CPC925_EDAC_REVISION " Ver: 1.0.0"
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c
index b2d71388172b..1a352cae1f52 100644
--- a/drivers/edac/e752x_edac.c
+++ b/drivers/edac/e752x_edac.c
@@ -24,7 +24,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define E752X_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "e752x_edac"
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c
index ece3aef16bb1..67ef07aed923 100644
--- a/drivers/edac/e7xxx_edac.c
+++ b/drivers/edac/e7xxx_edac.c
@@ -30,7 +30,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define E7XXX_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "e7xxx_edac"
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index a97900333e2d..de4d5d08af9e 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -12,23 +12,20 @@
* 19 Jan 2007
*/
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
#include <linux/module.h>
-#include <linux/types.h>
+#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/init.h>
+#include <linux/spinlock.h>
#include <linux/sysctl.h>
-#include <linux/highmem.h>
#include <linux/timer.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/ctype.h>
-#include <linux/workqueue.h>
-#include <asm/uaccess.h>
-#include <asm/page.h>
-#include "edac_core.h"
+#include "edac_device.h"
#include "edac_module.h"
/* lock for the list: 'edac_device_list', manipulation of this list
@@ -50,21 +47,6 @@ static void edac_device_dump_device(struct edac_device_ctl_info *edac_dev)
}
#endif /* CONFIG_EDAC_DEBUG */
-
-/*
- * edac_device_alloc_ctl_info()
- * Allocate a new edac device control info structure
- *
- * The control structure is allocated in complete chunk
- * from the OS. It is in turn sub allocated to the
- * various objects that compose the structure
- *
- * The structure has a 'nr_instance' array within itself.
- * Each instance represents a major component
- * Example: L1 cache and L2 cache are 2 instance components
- *
- * Within each instance is an array of 'nr_blocks' blockoffsets
- */
struct edac_device_ctl_info *edac_device_alloc_ctl_info(
unsigned sz_private,
char *edac_device_name, unsigned nr_instances,
@@ -244,11 +226,6 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info(
}
EXPORT_SYMBOL_GPL(edac_device_alloc_ctl_info);
-/*
- * edac_device_free_ctl_info()
- * frees the memory allocated by the edac_device_alloc_ctl_info()
- * function
- */
void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info)
{
edac_device_unregister_sysfs_main_kobj(ctl_info);
@@ -460,12 +437,6 @@ void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
edac_mod_work(&edac_dev->work, jiffs);
}
-/*
- * edac_device_alloc_index: Allocate a unique device index number
- *
- * Return:
- * allocated index number
- */
int edac_device_alloc_index(void)
{
static atomic_t device_indexes = ATOMIC_INIT(0);
@@ -474,17 +445,6 @@ int edac_device_alloc_index(void)
}
EXPORT_SYMBOL_GPL(edac_device_alloc_index);
-/**
- * edac_device_add_device: Insert the 'edac_dev' structure into the
- * edac_device global list and create sysfs entries associated with
- * edac_device structure.
- * @edac_device: pointer to the edac_device structure to be added to the list
- * 'edac_device' structure.
- *
- * Return:
- * 0 Success
- * !0 Failure
- */
int edac_device_add_device(struct edac_device_ctl_info *edac_dev)
{
edac_dbg(0, "\n");
@@ -541,19 +501,6 @@ fail0:
}
EXPORT_SYMBOL_GPL(edac_device_add_device);
-/**
- * edac_device_del_device:
- * Remove sysfs entries for specified edac_device structure and
- * then remove edac_device structure from global list
- *
- * @dev:
- * Pointer to 'struct device' representing edac_device
- * structure to remove.
- *
- * Return:
- * Pointer to removed edac_device structure,
- * OR NULL if device not found.
- */
struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
{
struct edac_device_ctl_info *edac_dev;
@@ -608,10 +555,6 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
return edac_dev->panic_on_ue;
}
-/*
- * edac_device_handle_ce
- * perform a common output and handling of an 'edac_dev' CE event
- */
void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg)
{
@@ -654,10 +597,6 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
}
EXPORT_SYMBOL_GPL(edac_device_handle_ce);
-/*
- * edac_device_handle_ue
- * perform a common output and handling of an 'edac_dev' UE event
- */
void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg)
{
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_device.h
index 4861542163d7..1aaba74ae411 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_device.h
@@ -1,5 +1,5 @@
/*
- * Defines, structures, APIs for edac_core module
+ * Defines, structures, APIs for edac_device
*
* (C) 2007 Linux Networx (http://lnxi.com)
* This file may be distributed under the terms of the
@@ -15,86 +15,22 @@
* Refactored for multi-source files:
* Doug Thompson <norsk5@xmission.com>
*
+ * Please look at Documentation/driver-api/edac.rst for more info about
+ * EDAC core structs and functions.
*/
-#ifndef _EDAC_CORE_H_
-#define _EDAC_CORE_H_
+#ifndef _EDAC_DEVICE_H_
+#define _EDAC_DEVICE_H_
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/pci.h>
-#include <linux/time.h>
-#include <linux/nmi.h>
-#include <linux/rcupdate.h>
#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/edac.h>
#include <linux/kobject.h>
-#include <linux/platform_device.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/sysfs.h>
#include <linux/workqueue.h>
-#include <linux/edac.h>
-
-#define EDAC_DEVICE_NAME_LEN 31
-#define EDAC_ATTRIB_VALUE_LEN 15
-
-#if PAGE_SHIFT < 20
-#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT))
-#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
-#else /* PAGE_SHIFT > 20 */
-#define PAGES_TO_MiB(pages) ((pages) << (PAGE_SHIFT - 20))
-#define MiB_TO_PAGES(mb) ((mb) >> (PAGE_SHIFT - 20))
-#endif
-
-#define edac_printk(level, prefix, fmt, arg...) \
- printk(level "EDAC " prefix ": " fmt, ##arg)
-
-#define edac_mc_printk(mci, level, fmt, arg...) \
- printk(level "EDAC MC%d: " fmt, mci->mc_idx, ##arg)
-
-#define edac_mc_chipset_printk(mci, level, prefix, fmt, arg...) \
- printk(level "EDAC " prefix " MC%d: " fmt, mci->mc_idx, ##arg)
-
-#define edac_device_printk(ctl, level, fmt, arg...) \
- printk(level "EDAC DEVICE%d: " fmt, ctl->dev_idx, ##arg)
-
-#define edac_pci_printk(ctl, level, fmt, arg...) \
- printk(level "EDAC PCI%d: " fmt, ctl->pci_idx, ##arg)
-/* prefixes for edac_printk() and edac_mc_printk() */
-#define EDAC_MC "MC"
-#define EDAC_PCI "PCI"
-#define EDAC_DEBUG "DEBUG"
-
-extern const char * const edac_mem_types[];
-
-#ifdef CONFIG_EDAC_DEBUG
-extern int edac_debug_level;
-
-#define edac_dbg(level, fmt, ...) \
-do { \
- if (level <= edac_debug_level) \
- edac_printk(KERN_DEBUG, EDAC_DEBUG, \
- "%s: " fmt, __func__, ##__VA_ARGS__); \
-} while (0)
-
-#else /* !CONFIG_EDAC_DEBUG */
-
-#define edac_dbg(level, fmt, ...) \
-do { \
- if (0) \
- edac_printk(KERN_DEBUG, EDAC_DEBUG, \
- "%s: " fmt, __func__, ##__VA_ARGS__); \
-} while (0)
-
-#endif /* !CONFIG_EDAC_DEBUG */
-
-#define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, \
- PCI_DEVICE_ID_ ## vend ## _ ## dev
-
-#define edac_dev_name(dev) (dev)->dev_name
-
-#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
/*
* The following are the structures to provide for a generic
@@ -321,197 +257,64 @@ extern struct edac_device_ctl_info *edac_device_alloc_ctl_info(
extern void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info);
-#ifdef CONFIG_PCI
-
-struct edac_pci_counter {
- atomic_t pe_count;
- atomic_t npe_count;
-};
-
-/*
- * Abstract edac_pci control info structure
+/**
+ * edac_device_add_device: Insert the 'edac_dev' structure into the
+ * edac_device global list and create sysfs entries associated with
+ * edac_device structure.
+ *
+ * @edac_dev: pointer to edac_device structure to be added to the list
+ * 'edac_device' structure.
*
+ * Returns:
+ * 0 on Success, or an error code on failure
*/
-struct edac_pci_ctl_info {
- /* for global list of edac_pci_ctl_info structs */
- struct list_head link;
-
- int pci_idx;
-
- struct bus_type *edac_subsys; /* pointer to subsystem */
-
- /* the internal state of this controller instance */
- int op_state;
- /* work struct for this instance */
- struct delayed_work work;
-
- /* pointer to edac polling checking routine:
- * If NOT NULL: points to polling check routine
- * If NULL: Then assumes INTERRUPT operation, where
- * MC driver will receive events
- */
- void (*edac_check) (struct edac_pci_ctl_info * edac_dev);
-
- struct device *dev; /* pointer to device structure */
-
- const char *mod_name; /* module name */
- const char *ctl_name; /* edac controller name */
- const char *dev_name; /* pci/platform/etc... name */
-
- void *pvt_info; /* pointer to 'private driver' info */
-
- unsigned long start_time; /* edac_pci load start time (jiffies) */
-
- struct completion complete;
-
- /* sysfs top name under 'edac' directory
- * and instance name:
- * cpu/cpu0/...
- * cpu/cpu1/...
- * cpu/cpu2/...
- * ...
- */
- char name[EDAC_DEVICE_NAME_LEN + 1];
-
- /* Event counters for the this whole EDAC Device */
- struct edac_pci_counter counters;
-
- /* edac sysfs device control for the 'name'
- * device this structure controls
- */
- struct kobject kobj;
- struct completion kobj_complete;
-};
-
-#define to_edac_pci_ctl_work(w) \
- container_of(w, struct edac_pci_ctl_info,work)
-
-/* write all or some bits in a byte-register*/
-static inline void pci_write_bits8(struct pci_dev *pdev, int offset, u8 value,
- u8 mask)
-{
- if (mask != 0xff) {
- u8 buf;
-
- pci_read_config_byte(pdev, offset, &buf);
- value &= mask;
- buf &= ~mask;
- value |= buf;
- }
-
- pci_write_config_byte(pdev, offset, value);
-}
-
-/* write all or some bits in a word-register*/
-static inline void pci_write_bits16(struct pci_dev *pdev, int offset,
- u16 value, u16 mask)
-{
- if (mask != 0xffff) {
- u16 buf;
-
- pci_read_config_word(pdev, offset, &buf);
- value &= mask;
- buf &= ~mask;
- value |= buf;
- }
-
- pci_write_config_word(pdev, offset, value);
-}
+extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev);
-/*
- * pci_write_bits32
+/**
+ * edac_device_del_device:
+ * Remove sysfs entries for specified edac_device structure and
+ * then remove edac_device structure from global list
*
- * edac local routine to do pci_write_config_dword, but adds
- * a mask parameter. If mask is all ones, ignore the mask.
- * Otherwise utilize the mask to isolate specified bits
+ * @dev:
+ * Pointer to struct &device representing the edac device
+ * structure to remove.
*
- * write all or some bits in a dword-register
+ * Returns:
+ * Pointer to removed edac_device structure,
+ * or %NULL if device not found.
*/
-static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
- u32 value, u32 mask)
-{
- if (mask != 0xffffffff) {
- u32 buf;
-
- pci_read_config_dword(pdev, offset, &buf);
- value &= mask;
- buf &= ~mask;
- value |= buf;
- }
-
- pci_write_config_dword(pdev, offset, value);
-}
-
-#endif /* CONFIG_PCI */
-
-struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
- unsigned n_layers,
- struct edac_mc_layer *layers,
- unsigned sz_pvt);
-extern int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
- const struct attribute_group **groups);
-#define edac_mc_add_mc(mci) edac_mc_add_mc_with_groups(mci, NULL)
-extern void edac_mc_free(struct mem_ctl_info *mci);
-extern struct mem_ctl_info *edac_mc_find(int idx);
-extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
-extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
-extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
- unsigned long page);
-
-void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
- struct edac_raw_error_desc *e);
-
-void edac_mc_handle_error(const enum hw_event_mc_err_type type,
- struct mem_ctl_info *mci,
- const u16 error_count,
- const unsigned long page_frame_number,
- const unsigned long offset_in_page,
- const unsigned long syndrome,
- const int top_layer,
- const int mid_layer,
- const int low_layer,
- const char *msg,
- const char *other_detail);
+extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
-/*
- * edac_device APIs
+/**
+ * edac_device_handle_ue():
+ * perform a common output and handling of an 'edac_dev' UE event
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @inst_nr: number of the instance where the UE error happened
+ * @block_nr: number of the block where the UE error happened
+ * @msg: message to be printed
*/
-extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev);
-extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);
+/**
+ * edac_device_handle_ce():
+ * perform a common output and handling of an 'edac_dev' CE event
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @inst_nr: number of the instance where the CE error happened
+ * @block_nr: number of the block where the CE error happened
+ * @msg: message to be printed
+ */
extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);
-extern int edac_device_alloc_index(void);
-extern const char *edac_layer_name[];
-/*
- * edac_pci APIs
- */
-extern struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt,
- const char *edac_pci_name);
-
-extern void edac_pci_free_ctl_info(struct edac_pci_ctl_info *pci);
-
-extern void edac_pci_reset_delay_period(struct edac_pci_ctl_info *pci,
- unsigned long value);
-
-extern int edac_pci_alloc_index(void);
-extern int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx);
-extern struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev);
-
-extern struct edac_pci_ctl_info *edac_pci_create_generic_ctl(
- struct device *dev,
- const char *mod_name);
-
-extern void edac_pci_release_generic_ctl(struct edac_pci_ctl_info *pci);
-extern int edac_pci_create_sysfs(struct edac_pci_ctl_info *pci);
-extern void edac_pci_remove_sysfs(struct edac_pci_ctl_info *pci);
-
-/*
- * edac misc APIs
+/**
+ * edac_device_alloc_index: Allocate a unique device index number
+ *
+ * Returns:
+ * allocated index number
*/
-extern char *edac_op_state_to_string(int op_state);
+extern int edac_device_alloc_index(void);
+extern const char *edac_layer_name[];
-#endif /* _EDAC_CORE_H_ */
+#endif
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 93da1a45c716..0e7ea3591b78 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -1,7 +1,7 @@
/*
* file for managing the edac_device subsystem of devices for EDAC
*
- * (C) 2007 SoftwareBitMaker
+ * (C) 2007 SoftwareBitMaker
*
* This file may be distributed under the terms of the
* GNU General Public License.
@@ -15,7 +15,7 @@
#include <linux/slab.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_device.h"
#include "edac_module.h"
#define EDAC_DEVICE_SYMLINK "device"
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index d2ea9c4f1824..5f2c717f8053 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -30,7 +30,7 @@
#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/page.h>
-#include "edac_core.h"
+#include "edac_mc.h"
#include "edac_module.h"
#include <ras/ras_event.h>
@@ -239,30 +239,6 @@ static void _edac_mc_free(struct mem_ctl_info *mci)
kfree(mci);
}
-/**
- * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
- * @mc_num: Memory controller number
- * @n_layers: Number of MC hierarchy layers
- * layers: Describes each layer as seen by the Memory Controller
- * @size_pvt: size of private storage needed
- *
- *
- * Everything is kmalloc'ed as one big chunk - more efficient.
- * Only can be used if all structures have the same lifetime - otherwise
- * you have to allocate and initialize your own structures.
- *
- * Use edac_mc_free() to free mc structures allocated by this function.
- *
- * NOTE: drivers handle multi-rank memories in different ways: in some
- * drivers, one multi-rank memory stick is mapped as one entry, while, in
- * others, a single multi-rank memory stick would be mapped into several
- * entries. Currently, this function will allocate multiple struct dimm_info
- * on such scenarios, as grouping the multiple ranks require drivers change.
- *
- * Returns:
- * On failure: NULL
- * On success: struct mem_ctl_info pointer
- */
struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
unsigned n_layers,
struct edac_mc_layer *layers,
@@ -460,11 +436,6 @@ error:
}
EXPORT_SYMBOL_GPL(edac_mc_alloc);
-/**
- * edac_mc_free
- * 'Free' a previously allocated 'mci' structure
- * @mci: pointer to a struct mem_ctl_info structure
- */
void edac_mc_free(struct mem_ctl_info *mci)
{
edac_dbg(1, "\n");
@@ -646,12 +617,6 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
return handlers;
}
-/**
- * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'.
- *
- * If found, return a pointer to the structure.
- * Else return NULL.
- */
struct mem_ctl_info *edac_mc_find(int idx)
{
struct mem_ctl_info *mci = NULL;
@@ -676,16 +641,6 @@ unlock:
}
EXPORT_SYMBOL(edac_mc_find);
-/**
- * edac_mc_add_mc_with_groups: Insert the 'mci' structure into the mci
- * global list and create sysfs entries associated with mci structure
- * @mci: pointer to the mci structure to be added to the list
- * @groups: optional attribute groups for the driver-specific sysfs entries
- *
- * Return:
- * 0 Success
- * !0 Failure
- */
/* FIXME - should a warning be printed if no error detection? correction? */
int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
@@ -776,13 +731,6 @@ fail0:
}
EXPORT_SYMBOL_GPL(edac_mc_add_mc_with_groups);
-/**
- * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
- * remove mci structure from global list
- * @pdev: Pointer to 'struct device' representing mci structure to remove.
- *
- * Return pointer to removed mci structure, or NULL if device not found.
- */
struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
{
struct mem_ctl_info *mci;
@@ -1046,18 +994,6 @@ static void edac_ue_error(struct mem_ctl_info *mci,
edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
}
-/**
- * edac_raw_mc_handle_error - reports a memory event to userspace without doing
- * anything to discover the error location
- *
- * @type: severity of the error (CE/UE/Fatal)
- * @mci: a struct mem_ctl_info pointer
- * @e: error description
- *
- * This raw function is used internally by edac_mc_handle_error(). It should
- * only be called directly when the hardware error come directly from BIOS,
- * like in the case of APEI GHES driver.
- */
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
struct edac_raw_error_desc *e)
@@ -1087,24 +1023,6 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
}
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
-/**
- * edac_mc_handle_error - reports a memory event to userspace
- *
- * @type: severity of the error (CE/UE/Fatal)
- * @mci: a struct mem_ctl_info pointer
- * @error_count: Number of errors of the same type
- * @page_frame_number: mem page where the error occurred
- * @offset_in_page: offset of the error inside the page
- * @syndrome: ECC syndrome
- * @top_layer: Memory layer[0] position
- * @mid_layer: Memory layer[1] position
- * @low_layer: Memory layer[2] position
- * @msg: Message meaningful to the end users that
- * explains the event
- * @other_detail: Technical details about the event that
- * may help hardware manufacturers and
- * EDAC developers to analyse the event
- */
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
const u16 error_count,
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
new file mode 100644
index 000000000000..50fc1dc9c0d8
--- /dev/null
+++ b/drivers/edac/edac_mc.h
@@ -0,0 +1,245 @@
+/*
+ * Defines, structures, APIs for edac_mc module
+ *
+ * (C) 2007 Linux Networx (http://lnxi.com)
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ *
+ * Written by Thayne Harbaugh
+ * Based on work by Dan Hollis <goemon at anime dot net> and others.
+ * http://www.anime.net/~goemon/linux-ecc/
+ *
+ * NMI handling support added by
+ * Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com>
+ *
+ * Refactored for multi-source files:
+ * Doug Thompson <norsk5@xmission.com>
+ *
+ * Please look at Documentation/driver-api/edac.rst for more info about
+ * EDAC core structs and functions.
+ */
+
+#ifndef _EDAC_MC_H_
+#define _EDAC_MC_H_
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/pci.h>
+#include <linux/time.h>
+#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/completion.h>
+#include <linux/kobject.h>
+#include <linux/platform_device.h>
+#include <linux/workqueue.h>
+#include <linux/edac.h>
+
+#if PAGE_SHIFT < 20
+#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT))
+#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+#else /* PAGE_SHIFT > 20 */
+#define PAGES_TO_MiB(pages) ((pages) << (PAGE_SHIFT - 20))
+#define MiB_TO_PAGES(mb) ((mb) >> (PAGE_SHIFT - 20))
+#endif
+
+#define edac_printk(level, prefix, fmt, arg...) \
+ printk(level "EDAC " prefix ": " fmt, ##arg)
+
+#define edac_mc_printk(mci, level, fmt, arg...) \
+ printk(level "EDAC MC%d: " fmt, mci->mc_idx, ##arg)
+
+#define edac_mc_chipset_printk(mci, level, prefix, fmt, arg...) \
+ printk(level "EDAC " prefix " MC%d: " fmt, mci->mc_idx, ##arg)
+
+#define edac_device_printk(ctl, level, fmt, arg...) \
+ printk(level "EDAC DEVICE%d: " fmt, ctl->dev_idx, ##arg)
+
+#define edac_pci_printk(ctl, level, fmt, arg...) \
+ printk(level "EDAC PCI%d: " fmt, ctl->pci_idx, ##arg)
+
+/* prefixes for edac_printk() and edac_mc_printk() */
+#define EDAC_MC "MC"
+#define EDAC_PCI "PCI"
+#define EDAC_DEBUG "DEBUG"
+
+extern const char * const edac_mem_types[];
+
+#ifdef CONFIG_EDAC_DEBUG
+extern int edac_debug_level;
+
+#define edac_dbg(level, fmt, ...) \
+do { \
+ if (level <= edac_debug_level) \
+ edac_printk(KERN_DEBUG, EDAC_DEBUG, \
+ "%s: " fmt, __func__, ##__VA_ARGS__); \
+} while (0)
+
+#else /* !CONFIG_EDAC_DEBUG */
+
+#define edac_dbg(level, fmt, ...) \
+do { \
+ if (0) \
+ edac_printk(KERN_DEBUG, EDAC_DEBUG, \
+ "%s: " fmt, __func__, ##__VA_ARGS__); \
+} while (0)
+
+#endif /* !CONFIG_EDAC_DEBUG */
+
+#define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, \
+ PCI_DEVICE_ID_ ## vend ## _ ## dev
+
+#define edac_dev_name(dev) (dev)->dev_name
+
+#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
+
+/**
+ * edac_mc_alloc() - Allocate and partially fill a struct &mem_ctl_info.
+ *
+ * @mc_num: Memory controller number
+ * @n_layers: Number of MC hierarchy layers
+ * @layers: Describes each layer as seen by the Memory Controller
+ * @sz_pvt: size of private storage needed
+ *
+ *
+ * Everything is kmalloc'ed as one big chunk - more efficient.
+ * Only can be used if all structures have the same lifetime - otherwise
+ * you have to allocate and initialize your own structures.
+ *
+ * Use edac_mc_free() to free mc structures allocated by this function.
+ *
+ * .. note::
+ *
+ * drivers handle multi-rank memories in different ways: in some
+ * drivers, one multi-rank memory stick is mapped as one entry, while, in
+ * others, a single multi-rank memory stick would be mapped into several
+ * entries. Currently, this function will allocate multiple struct dimm_info
+ * on such scenarios, as grouping the multiple ranks require drivers change.
+ *
+ * Returns:
+ * On success, return a pointer to struct mem_ctl_info pointer;
+ * %NULL otherwise
+ */
+struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+ unsigned n_layers,
+ struct edac_mc_layer *layers,
+ unsigned sz_pvt);
+
+/**
+ * edac_mc_add_mc_with_groups() - Insert the @mci structure into the mci
+ * global list and create sysfs entries associated with @mci structure.
+ *
+ * @mci: pointer to the mci structure to be added to the list
+ * @groups: optional attribute groups for the driver-specific sysfs entries
+ *
+ * Returns:
+ * 0 on Success, or an error code on failure
+ */
+extern int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci,
+ const struct attribute_group **groups);
+#define edac_mc_add_mc(mci) edac_mc_add_mc_with_groups(mci, NULL)
+
+/**
+ * edac_mc_free() - Frees a previously allocated @mci structure
+ *
+ * @mci: pointer to a struct mem_ctl_info structure
+ */
+extern void edac_mc_free(struct mem_ctl_info *mci);
+
+/**
+ * edac_mc_find() - Search for a mem_ctl_info structure whose index is @idx.
+ *
+ * @idx: index to be seek
+ *
+ * If found, return a pointer to the structure.
+ * Else return NULL.
+ */
+extern struct mem_ctl_info *edac_mc_find(int idx);
+
+/**
+ * find_mci_by_dev() - Scan list of controllers looking for the one that
+ * manages the @dev device.
+ *
+ * @dev: pointer to a struct device related with the MCI
+ *
+ * Returns: on success, returns a pointer to struct &mem_ctl_info;
+ * %NULL otherwise.
+ */
+extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
+
+/**
+ * edac_mc_del_mc() - Remove sysfs entries for mci structure associated with
+ * @dev and remove mci structure from global list.
+ *
+ * @dev: Pointer to struct &device representing mci structure to remove.
+ *
+ * Returns: pointer to removed mci structure, or %NULL if device not found.
+ */
+extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
+
+/**
+ * edac_mc_find_csrow_by_page() - Ancillary routine to identify what csrow
+ * contains a memory page.
+ *
+ * @mci: pointer to a struct mem_ctl_info structure
+ * @page: memory page to find
+ *
+ * Returns: on success, returns the csrow. -1 if not found.
+ */
+extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
+ unsigned long page);
+
+/**
+ * edac_raw_mc_handle_error() - Reports a memory event to userspace without
+ * doing anything to discover the error location.
+ *
+ * @type: severity of the error (CE/UE/Fatal)
+ * @mci: a struct mem_ctl_info pointer
+ * @e: error description
+ *
+ * This raw function is used internally by edac_mc_handle_error(). It should
+ * only be called directly when the hardware error come directly from BIOS,
+ * like in the case of APEI GHES driver.
+ */
+void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
+ struct mem_ctl_info *mci,
+ struct edac_raw_error_desc *e);
+
+/**
+ * edac_mc_handle_error() - Reports a memory event to userspace.
+ *
+ * @type: severity of the error (CE/UE/Fatal)
+ * @mci: a struct mem_ctl_info pointer
+ * @error_count: Number of errors of the same type
+ * @page_frame_number: mem page where the error occurred
+ * @offset_in_page: offset of the error inside the page
+ * @syndrome: ECC syndrome
+ * @top_layer: Memory layer[0] position
+ * @mid_layer: Memory layer[1] position
+ * @low_layer: Memory layer[2] position
+ * @msg: Message meaningful to the end users that
+ * explains the event
+ * @other_detail: Technical details about the event that
+ * may help hardware manufacturers and
+ * EDAC developers to analyse the event
+ */
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+ struct mem_ctl_info *mci,
+ const u16 error_count,
+ const unsigned long page_frame_number,
+ const unsigned long offset_in_page,
+ const unsigned long syndrome,
+ const int top_layer,
+ const int mid_layer,
+ const int low_layer,
+ const char *msg,
+ const char *other_detail);
+
+/*
+ * edac misc APIs
+ */
+extern char *edac_op_state_to_string(int op_state);
+
+#endif /* _EDAC_MC_H_ */
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 4e0f8e720ad9..39dbab7d62f1 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -19,7 +19,7 @@
#include <linux/pm_runtime.h>
#include <linux/uaccess.h>
-#include "edac_core.h"
+#include "edac_mc.h"
#include "edac_module.h"
/* MC EDAC Controls, setable by module parameter, and sysfs */
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 5f8543be995a..172598a27d7d 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -12,7 +12,7 @@
*/
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_mc.h"
#include "edac_module.h"
#define EDAC_VERSION "Ver: 3.0.0"
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index cfaacb99c973..014871e169cc 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -10,7 +10,9 @@
#ifndef __EDAC_MODULE_H__
#define __EDAC_MODULE_H__
-#include "edac_core.h"
+#include "edac_mc.h"
+#include "edac_pci.h"
+#include "edac_device.h"
/*
* INTERNAL EDAC MODULE:
diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c
index 8f2f2899a7a2..4e9d5632041a 100644
--- a/drivers/edac/edac_pci.c
+++ b/drivers/edac/edac_pci.c
@@ -9,35 +9,25 @@
* or implied.
*
*/
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
#include <linux/module.h>
-#include <linux/types.h>
+#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/init.h>
+#include <linux/spinlock.h>
#include <linux/sysctl.h>
-#include <linux/highmem.h>
#include <linux/timer.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/ctype.h>
-#include <linux/workqueue.h>
-#include <asm/uaccess.h>
-#include <asm/page.h>
-#include "edac_core.h"
+#include "edac_pci.h"
#include "edac_module.h"
static DEFINE_MUTEX(edac_pci_ctls_mutex);
static LIST_HEAD(edac_pci_list);
static atomic_t pci_indexes = ATOMIC_INIT(0);
-/*
- * edac_pci_alloc_ctl_info
- *
- * The alloc() function for the 'edac_pci' control info
- * structure. The chip driver will allocate one of these for each
- * edac_pci it is going to control/register with the EDAC CORE.
- */
struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt,
const char *edac_pci_name)
{
@@ -68,16 +58,6 @@ struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt,
}
EXPORT_SYMBOL_GPL(edac_pci_alloc_ctl_info);
-/*
- * edac_pci_free_ctl_info()
- *
- * Last action on the pci control structure.
- *
- * call the remove sysfs information, which will unregister
- * this control struct's kobj. When that kobj's ref count
- * goes to zero, its release function will be call and then
- * kfree() the memory.
- */
void edac_pci_free_ctl_info(struct edac_pci_ctl_info *pci)
{
edac_dbg(1, "\n");
@@ -215,31 +195,12 @@ static void edac_pci_workq_function(struct work_struct *work_req)
mutex_unlock(&edac_pci_ctls_mutex);
}
-/*
- * edac_pci_alloc_index: Allocate a unique PCI index number
- *
- * Return:
- * allocated index number
- *
- */
int edac_pci_alloc_index(void)
{
return atomic_inc_return(&pci_indexes) - 1;
}
EXPORT_SYMBOL_GPL(edac_pci_alloc_index);
-/*
- * edac_pci_add_device: Insert the 'edac_dev' structure into the
- * edac_pci global list and create sysfs entries associated with
- * edac_pci structure.
- * @pci: pointer to the edac_device structure to be added to the list
- * @edac_idx: A unique numeric identifier to be assigned to the
- * 'edac_pci' structure.
- *
- * Return:
- * 0 Success
- * !0 Failure
- */
int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx)
{
edac_dbg(0, "\n");
@@ -285,19 +246,6 @@ fail0:
}
EXPORT_SYMBOL_GPL(edac_pci_add_device);
-/*
- * edac_pci_del_device()
- * Remove sysfs entries for specified edac_pci structure and
- * then remove edac_pci structure from global list
- *
- * @dev:
- * Pointer to 'struct device' representing edac_pci structure
- * to remove
- *
- * Return:
- * Pointer to removed edac_pci structure,
- * or NULL if device not found
- */
struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev)
{
struct edac_pci_ctl_info *pci;
@@ -351,17 +299,6 @@ struct edac_pci_gen_data {
int edac_idx;
};
-/*
- * edac_pci_create_generic_ctl
- *
- * A generic constructor for a PCI parity polling device
- * Some systems have more than one domain of PCI busses.
- * For systems with one domain, then this API will
- * provide for a generic poller.
- *
- * This routine calls the edac_pci_alloc_ctl_info() for
- * the generic device, with default values
- */
struct edac_pci_ctl_info *edac_pci_create_generic_ctl(struct device *dev,
const char *mod_name)
{
@@ -394,11 +331,6 @@ struct edac_pci_ctl_info *edac_pci_create_generic_ctl(struct device *dev,
}
EXPORT_SYMBOL_GPL(edac_pci_create_generic_ctl);
-/*
- * edac_pci_release_generic_ctl
- *
- * The release function of a generic EDAC PCI polling device
- */
void edac_pci_release_generic_ctl(struct edac_pci_ctl_info *pci)
{
edac_dbg(0, "pci mod=%s\n", pci->mod_name);
diff --git a/drivers/edac/edac_pci.h b/drivers/edac/edac_pci.h
new file mode 100644
index 000000000000..5175f5724cfa
--- /dev/null
+++ b/drivers/edac/edac_pci.h
@@ -0,0 +1,271 @@
+/*
+ * Defines, structures, APIs for edac_pci and edac_pci_sysfs
+ *
+ * (C) 2007 Linux Networx (http://lnxi.com)
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ *
+ * Written by Thayne Harbaugh
+ * Based on work by Dan Hollis <goemon at anime dot net> and others.
+ * http://www.anime.net/~goemon/linux-ecc/
+ *
+ * NMI handling support added by
+ * Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com>
+ *
+ * Refactored for multi-source files:
+ * Doug Thompson <norsk5@xmission.com>
+ *
+ * Please look at Documentation/driver-api/edac.rst for more info about
+ * EDAC core structs and functions.
+ */
+
+#ifndef _EDAC_PCI_H_
+#define _EDAC_PCI_H_
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/edac.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#ifdef CONFIG_PCI
+
+struct edac_pci_counter {
+ atomic_t pe_count;
+ atomic_t npe_count;
+};
+
+/*
+ * Abstract edac_pci control info structure
+ *
+ */
+struct edac_pci_ctl_info {
+ /* for global list of edac_pci_ctl_info structs */
+ struct list_head link;
+
+ int pci_idx;
+
+ struct bus_type *edac_subsys; /* pointer to subsystem */
+
+ /* the internal state of this controller instance */
+ int op_state;
+ /* work struct for this instance */
+ struct delayed_work work;
+
+ /* pointer to edac polling checking routine:
+ * If NOT NULL: points to polling check routine
+ * If NULL: Then assumes INTERRUPT operation, where
+ * MC driver will receive events
+ */
+ void (*edac_check) (struct edac_pci_ctl_info * edac_dev);
+
+ struct device *dev; /* pointer to device structure */
+
+ const char *mod_name; /* module name */
+ const char *ctl_name; /* edac controller name */
+ const char *dev_name; /* pci/platform/etc... name */
+
+ void *pvt_info; /* pointer to 'private driver' info */
+
+ unsigned long start_time; /* edac_pci load start time (jiffies) */
+
+ struct completion complete;
+
+ /* sysfs top name under 'edac' directory
+ * and instance name:
+ * cpu/cpu0/...
+ * cpu/cpu1/...
+ * cpu/cpu2/...
+ * ...
+ */
+ char name[EDAC_DEVICE_NAME_LEN + 1];
+
+ /* Event counters for the this whole EDAC Device */
+ struct edac_pci_counter counters;
+
+ /* edac sysfs device control for the 'name'
+ * device this structure controls
+ */
+ struct kobject kobj;
+};
+
+#define to_edac_pci_ctl_work(w) \
+ container_of(w, struct edac_pci_ctl_info,work)
+
+/* write all or some bits in a byte-register*/
+static inline void pci_write_bits8(struct pci_dev *pdev, int offset, u8 value,
+ u8 mask)
+{
+ if (mask != 0xff) {
+ u8 buf;
+
+ pci_read_config_byte(pdev, offset, &buf);
+ value &= mask;
+ buf &= ~mask;
+ value |= buf;
+ }
+
+ pci_write_config_byte(pdev, offset, value);
+}
+
+/* write all or some bits in a word-register*/
+static inline void pci_write_bits16(struct pci_dev *pdev, int offset,
+ u16 value, u16 mask)
+{
+ if (mask != 0xffff) {
+ u16 buf;
+
+ pci_read_config_word(pdev, offset, &buf);
+ value &= mask;
+ buf &= ~mask;
+ value |= buf;
+ }
+
+ pci_write_config_word(pdev, offset, value);
+}
+
+/*
+ * pci_write_bits32
+ *
+ * edac local routine to do pci_write_config_dword, but adds
+ * a mask parameter. If mask is all ones, ignore the mask.
+ * Otherwise utilize the mask to isolate specified bits
+ *
+ * write all or some bits in a dword-register
+ */
+static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
+ u32 value, u32 mask)
+{
+ if (mask != 0xffffffff) {
+ u32 buf;
+
+ pci_read_config_dword(pdev, offset, &buf);
+ value &= mask;
+ buf &= ~mask;
+ value |= buf;
+ }
+
+ pci_write_config_dword(pdev, offset, value);
+}
+
+#endif /* CONFIG_PCI */
+
+/*
+ * edac_pci APIs
+ */
+
+/**
+ * edac_pci_alloc_ctl_info:
+ * The alloc() function for the 'edac_pci' control info
+ * structure.
+ *
+ * @sz_pvt: size of the private info at struct &edac_pci_ctl_info
+ * @edac_pci_name: name of the PCI device
+ *
+ * The chip driver will allocate one of these for each
+ * edac_pci it is going to control/register with the EDAC CORE.
+ *
+ * Returns: a pointer to struct &edac_pci_ctl_info on success; %NULL otherwise.
+ */
+extern struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt,
+ const char *edac_pci_name);
+
+/**
+ * edac_pci_free_ctl_info():
+ * Last action on the pci control structure.
+ *
+ * @pci: pointer to struct &edac_pci_ctl_info
+ *
+ * Calls the remove sysfs information, which will unregister
+ * this control struct's kobj. When that kobj's ref count
+ * goes to zero, its release function will be call and then
+ * kfree() the memory.
+ */
+extern void edac_pci_free_ctl_info(struct edac_pci_ctl_info *pci);
+
+/**
+ * edac_pci_alloc_index: Allocate a unique PCI index number
+ *
+ * Returns:
+ * allocated index number
+ *
+ */
+extern int edac_pci_alloc_index(void);
+
+/**
+ * edac_pci_add_device(): Insert the 'edac_dev' structure into the
+ * edac_pci global list and create sysfs entries associated with
+ * edac_pci structure.
+ *
+ * @pci: pointer to the edac_device structure to be added to the list
+ * @edac_idx: A unique numeric identifier to be assigned to the
+ * 'edac_pci' structure.
+ *
+ * Returns:
+ * 0 on Success, or an error code on failure
+ */
+extern int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx);
+
+/**
+ * edac_pci_del_device()
+ * Remove sysfs entries for specified edac_pci structure and
+ * then remove edac_pci structure from global list
+ *
+ * @dev:
+ * Pointer to 'struct device' representing edac_pci structure
+ * to remove
+ *
+ * Returns:
+ * Pointer to removed edac_pci structure,
+ * or %NULL if device not found
+ */
+extern struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev);
+
+/**
+ * edac_pci_create_generic_ctl()
+ * A generic constructor for a PCI parity polling device
+ * Some systems have more than one domain of PCI busses.
+ * For systems with one domain, then this API will
+ * provide for a generic poller.
+ *
+ * @dev: pointer to struct &device;
+ * @mod_name: name of the PCI device
+ *
+ * This routine calls the edac_pci_alloc_ctl_info() for
+ * the generic device, with default values
+ *
+ * Returns: Pointer to struct &edac_pci_ctl_info on success, %NULL on
+ * failure.
+ */
+extern struct edac_pci_ctl_info *edac_pci_create_generic_ctl(
+ struct device *dev,
+ const char *mod_name);
+
+/**
+ * edac_pci_release_generic_ctl
+ * The release function of a generic EDAC PCI polling device
+ *
+ * @pci: pointer to struct &edac_pci_ctl_info
+ */
+extern void edac_pci_release_generic_ctl(struct edac_pci_ctl_info *pci);
+
+/**
+ * edac_pci_create_sysfs
+ * Create the controls/attributes for the specified EDAC PCI device
+ *
+ * @pci: pointer to struct &edac_pci_ctl_info
+ */
+extern int edac_pci_create_sysfs(struct edac_pci_ctl_info *pci);
+
+/**
+ * edac_pci_remove_sysfs()
+ * remove the controls and attributes for this EDAC PCI device
+ *
+ * @pci: pointer to struct &edac_pci_ctl_info
+ */
+extern void edac_pci_remove_sysfs(struct edac_pci_ctl_info *pci);
+
+#endif
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c
index 6e3428ba400f..72c9eb9fdffb 100644
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -11,7 +11,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
-#include "edac_core.h"
+#include "edac_pci.h"
#include "edac_module.h"
#define EDAC_PCI_SYMLINK "device"
@@ -418,12 +418,6 @@ static void edac_pci_main_kobj_teardown(void)
}
}
-/*
- *
- * edac_pci_create_sysfs
- *
- * Create the controls/attributes for the specified EDAC PCI device
- */
int edac_pci_create_sysfs(struct edac_pci_ctl_info *pci)
{
int err;
@@ -459,11 +453,6 @@ unregister_cleanup:
return err;
}
-/*
- * edac_pci_remove_sysfs
- *
- * remove the controls and attributes for this EDAC PCI device
- */
void edac_pci_remove_sysfs(struct edac_pci_ctl_info *pci)
{
edac_dbg(0, "index=%d\n", pci->pci_idx);
diff --git a/drivers/edac/fsl_ddr_edac.c b/drivers/edac/fsl_ddr_edac.c
index 9774f52f0c3e..4e9608a958e7 100644
--- a/drivers/edac/fsl_ddr_edac.c
+++ b/drivers/edac/fsl_ddr_edac.c
@@ -28,7 +28,6 @@
#include <linux/of_device.h>
#include <linux/of_address.h>
#include "edac_module.h"
-#include "edac_core.h"
#include "fsl_ddr_edac.h"
#define EDAC_MOD_STR "fsl_ddr_edac"
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index e3fa4390f846..4e61a6229dd2 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -14,7 +14,7 @@
#include <acpi/ghes.h>
#include <linux/edac.h>
#include <linux/dmi.h>
-#include "edac_core.h"
+#include "edac_module.h"
#include <ras/ras_event.h>
#define GHES_EDAC_REVISION " Ver: 1.0.0"
diff --git a/drivers/edac/highbank_l2_edac.c b/drivers/edac/highbank_l2_edac.c
index 2f193668ebc7..cd9a2bb7c548 100644
--- a/drivers/edac/highbank_l2_edac.c
+++ b/drivers/edac/highbank_l2_edac.c
@@ -21,7 +21,6 @@
#include <linux/platform_device.h>
#include <linux/of_platform.h>
-#include "edac_core.h"
#include "edac_module.h"
#define SR_CLR_SB_ECC_INTR 0x0
diff --git a/drivers/edac/highbank_mc_edac.c b/drivers/edac/highbank_mc_edac.c
index 11260cc3360e..0e7e0a404d89 100644
--- a/drivers/edac/highbank_mc_edac.c
+++ b/drivers/edac/highbank_mc_edac.c
@@ -22,7 +22,6 @@
#include <linux/of_platform.h>
#include <linux/uaccess.h>
-#include "edac_core.h"
#include "edac_module.h"
/* DDR Ctrlr Error Registers */
diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c
index 5cb36a6022cc..5306240570d7 100644
--- a/drivers/edac/i3000_edac.c
+++ b/drivers/edac/i3000_edac.c
@@ -14,7 +14,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define I3000_REVISION "1.1"
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
index 1f453382258a..77c58d201a30 100644
--- a/drivers/edac/i3200_edac.c
+++ b/drivers/edac/i3200_edac.c
@@ -13,7 +13,7 @@
#include <linux/pci_ids.h>
#include <linux/edac.h>
#include <linux/io.h>
-#include "edac_core.h"
+#include "edac_module.h"
#include <linux/io-64-nonatomic-lo-hi.h>
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 72e07e3cf718..1670d27bcac8 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -22,7 +22,7 @@
#include <linux/edac.h>
#include <asm/mmzone.h>
-#include "edac_core.h"
+#include "edac_module.h"
/*
* Alter this version for the I5000 module when modifications are made
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index c655162caf08..a8334c4acea7 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -29,7 +29,6 @@
#include <linux/mmzone.h>
#include <linux/debugfs.h>
-#include "edac_core.h"
#include "edac_module.h"
/* register addresses */
diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
index 6ef6ad1ba16e..abf6ef22e220 100644
--- a/drivers/edac/i5400_edac.c
+++ b/drivers/edac/i5400_edac.c
@@ -32,7 +32,7 @@
#include <linux/edac.h>
#include <linux/mmzone.h>
-#include "edac_core.h"
+#include "edac_module.h"
/*
* Alter this version for the I5400 module when modifications are made
diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
index dcac982fdc7a..0a912bf6de00 100644
--- a/drivers/edac/i7300_edac.c
+++ b/drivers/edac/i7300_edac.c
@@ -26,7 +26,7 @@
#include <linux/edac.h>
#include <linux/mmzone.h>
-#include "edac_core.h"
+#include "edac_module.h"
/*
* Alter this version for the I7300 module when modifications are made
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 8a68a5e943ea..69b5adead0ad 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -39,7 +39,7 @@
#include <asm/processor.h>
#include <asm/div64.h>
-#include "edac_core.h"
+#include "edac_module.h"
/* Static vars */
static LIST_HEAD(i7core_edac_list);
diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c
index 4d4110364f02..cb61a5b7d080 100644
--- a/drivers/edac/i82443bxgx_edac.c
+++ b/drivers/edac/i82443bxgx_edac.c
@@ -29,7 +29,7 @@
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define I82443_REVISION "0.1"
diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c
index ee1078cd3b96..236c813227fc 100644
--- a/drivers/edac/i82860_edac.c
+++ b/drivers/edac/i82860_edac.c
@@ -14,7 +14,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define I82860_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "i82860_edac"
diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c
index c26a513f8869..e286b7e74c7a 100644
--- a/drivers/edac/i82875p_edac.c
+++ b/drivers/edac/i82875p_edac.c
@@ -18,7 +18,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define I82875P_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "i82875p_edac"
diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c
index 35ab66c623a3..7baa8ace267b 100644
--- a/drivers/edac/i82975x_edac.c
+++ b/drivers/edac/i82975x_edac.c
@@ -14,7 +14,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define I82975X_REVISION " Ver: 1.0.0"
#define EDAC_MOD_STR "i82975x_edac"
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 1c88d9707495..2733fb5938a4 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -41,7 +41,7 @@
#include <linux/edac.h>
#include <linux/io-64-nonatomic-lo-hi.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define IE31200_REVISION "1.0"
#define EDAC_MOD_STR "ie31200_edac"
diff --git a/drivers/edac/layerscape_edac.c b/drivers/edac/layerscape_edac.c
index 6c59d897ad12..94cac7686a56 100644
--- a/drivers/edac/layerscape_edac.c
+++ b/drivers/edac/layerscape_edac.c
@@ -16,7 +16,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "edac_core.h"
+#include "edac_module.h"
#include "fsl_ddr_edac.h"
static const struct of_device_id fsl_ddr_mc_err_of_match[] = {
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index c62602141f95..8f66cbed70b7 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -25,7 +25,6 @@
#include <linux/of_platform.h>
#include <linux/of_device.h>
#include "edac_module.h"
-#include "edac_core.h"
#include "mpc85xx_edac.h"
#include "fsl_ddr_edac.h"
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index cb9b8577acbc..14b7e7b71eaa 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -17,7 +17,6 @@
#include <linux/edac.h>
#include <linux/gfp.h>
-#include "edac_core.h"
#include "edac_module.h"
#include "mv64x60_edac.h"
diff --git a/drivers/edac/octeon_edac-l2c.c b/drivers/edac/octeon_edac-l2c.c
index afea7fc625cc..c33059e9b0be 100644
--- a/drivers/edac/octeon_edac-l2c.c
+++ b/drivers/edac/octeon_edac-l2c.c
@@ -16,7 +16,6 @@
#include <asm/octeon/cvmx.h>
-#include "edac_core.h"
#include "edac_module.h"
#define EDAC_MOD_STR "octeon-l2c"
diff --git a/drivers/edac/octeon_edac-lmc.c b/drivers/edac/octeon_edac-lmc.c
index cda6dab5067a..9c1ffe3e912b 100644
--- a/drivers/edac/octeon_edac-lmc.c
+++ b/drivers/edac/octeon_edac-lmc.c
@@ -19,7 +19,6 @@
#include <asm/octeon/octeon.h>
#include <asm/octeon/cvmx-lmcx-defs.h>
-#include "edac_core.h"
#include "edac_module.h"
#define OCTEON_MAX_MC 4
diff --git a/drivers/edac/octeon_edac-pc.c b/drivers/edac/octeon_edac-pc.c
index 2ab6cf24c959..754eced59c32 100644
--- a/drivers/edac/octeon_edac-pc.c
+++ b/drivers/edac/octeon_edac-pc.c
@@ -15,7 +15,6 @@
#include <linux/io.h>
#include <linux/edac.h>
-#include "edac_core.h"
#include "edac_module.h"
#include <asm/octeon/cvmx.h>
diff --git a/drivers/edac/octeon_edac-pci.c b/drivers/edac/octeon_edac-pci.c
index 9ca73cec74e7..28b238eecefc 100644
--- a/drivers/edac/octeon_edac-pci.c
+++ b/drivers/edac/octeon_edac-pci.c
@@ -18,7 +18,6 @@
#include <asm/octeon/cvmx-pci-defs.h>
#include <asm/octeon/octeon.h>
-#include "edac_core.h"
#include "edac_module.h"
static void octeon_pci_poll(struct edac_pci_ctl_info *pci)
diff --git a/drivers/edac/pasemi_edac.c b/drivers/edac/pasemi_edac.c
index 9c971b575530..199f2c80480d 100644
--- a/drivers/edac/pasemi_edac.c
+++ b/drivers/edac/pasemi_edac.c
@@ -26,7 +26,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define MODULE_NAME "pasemi_edac"
diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c
index 691ce25e9010..e55e92590106 100644
--- a/drivers/edac/ppc4xx_edac.c
+++ b/drivers/edac/ppc4xx_edac.c
@@ -21,7 +21,7 @@
#include <asm/dcr.h>
-#include "edac_core.h"
+#include "edac_module.h"
#include "ppc4xx_edac.h"
/*
diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c
index 8f936bc7a010..978916625ced 100644
--- a/drivers/edac/r82600_edac.c
+++ b/drivers/edac/r82600_edac.c
@@ -20,7 +20,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/edac.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define R82600_REVISION " Ver: 2.0.2"
#define EDAC_MOD_STR "r82600_edac"
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index c1ad0eb7d5dd..54ae6dc45ab2 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -27,7 +27,7 @@
#include <asm/processor.h>
#include <asm/mce.h>
-#include "edac_core.h"
+#include "edac_module.h"
/* Static vars */
static LIST_HEAD(sbridge_edac_list);
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index 9edcb29b3001..79ef675e4d6f 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -29,7 +29,7 @@
#include <asm/processor.h>
#include <asm/mce.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define SKX_REVISION " Ver: 1.0 "
diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c
index fc153aea2f6c..1c01dec78ec3 100644
--- a/drivers/edac/synopsys_edac.c
+++ b/drivers/edac/synopsys_edac.c
@@ -23,7 +23,7 @@
#include <linux/module.h>
#include <linux/platform_device.h>
-#include "edac_core.h"
+#include "edac_module.h"
/* Number of cs_rows needed per memory controller */
#define SYNPS_EDAC_NR_CSROWS 1
diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c
index 71381642ce2a..8a33a87e67f1 100644
--- a/drivers/edac/tile_edac.c
+++ b/drivers/edac/tile_edac.c
@@ -30,7 +30,7 @@
#include <hv/hypervisor.h>
#include <hv/drv_mshim_intf.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define DRV_NAME "tile-edac"
diff --git a/drivers/edac/x38_edac.c b/drivers/edac/x38_edac.c
index 314cf5cf268c..03c97a4bf590 100644
--- a/drivers/edac/x38_edac.c
+++ b/drivers/edac/x38_edac.c
@@ -16,7 +16,7 @@
#include <linux/edac.h>
#include <linux/io-64-nonatomic-lo-hi.h>
-#include "edac_core.h"
+#include "edac_module.h"
#define X38_REVISION "1.1"
diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c
index 5569391ea800..6c270d9d304a 100644
--- a/drivers/edac/xgene_edac.c
+++ b/drivers/edac/xgene_edac.c
@@ -28,7 +28,6 @@
#include <linux/of_address.h>
#include <linux/regmap.h>
-#include "edac_core.h"
#include "edac_module.h"
#define EDAC_MOD_STR "xgene_edac"
diff --git a/include/linux/edac.h b/include/linux/edac.h
index cb56dcba68c6..07c52c0af62d 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -18,6 +18,8 @@
#include <linux/workqueue.h>
#include <linux/debugfs.h>
+#define EDAC_DEVICE_NAME_LEN 31
+
struct device;
#define EDAC_OPSTATE_INVAL -1
@@ -128,8 +130,16 @@ enum dev_type {
* fatal (maybe it is on an unused memory area,
* or the memory controller could recover from
* it for example, by re-trying the operation).
+ * @HW_EVENT_ERR_DEFERRED: Deferred Error - Indicates an uncorrectable
+ * error whose handling is not urgent. This could
+ * be due to hardware data poisoning where the
+ * system can continue operation until the poisoned
+ * data is consumed. Preemptive measures may also
+ * be taken, e.g. offlining pages, etc.
* @HW_EVENT_ERR_FATAL: Fatal Error - Uncorrected error that could not
* be recovered.
+ * @HW_EVENT_ERR_INFO: Informational - The CPER spec defines a forth
+ * type of error: informational logs.
*/
enum hw_event_mc_err_type {
HW_EVENT_ERR_CORRECTED,
@@ -160,7 +170,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
* enum mem_type - memory types. For a more detailed reference, please see
* http://en.wikipedia.org/wiki/DRAM
*
- * @MEM_EMPTY Empty csrow
+ * @MEM_EMPTY: Empty csrow
* @MEM_RESERVED: Reserved csrow type
* @MEM_UNKNOWN: Unknown csrow type
* @MEM_FPM: FPM - Fast Page Mode, used on systems up to 1995.
@@ -284,7 +294,7 @@ enum edac_type {
/**
* enum scrub_type - scrubbing capabilities
- * @SCRUB_UNKNOWN Unknown if scrubber is available
+ * @SCRUB_UNKNOWN: Unknown if scrubber is available
* @SCRUB_NONE: No scrubber
* @SCRUB_SW_PROG: SW progressive (sequential) scrubbing
* @SCRUB_SW_SRC: Software scrub only errors
@@ -293,7 +303,7 @@ enum edac_type {
* @SCRUB_HW_PROG: HW progressive (sequential) scrubbing
* @SCRUB_HW_SRC: Hardware scrub only errors
* @SCRUB_HW_PROG_SRC: Progressive hardware scrub from an error
- * SCRUB_HW_TUNABLE: Hardware scrub frequency is tunable
+ * @SCRUB_HW_TUNABLE: Hardware scrub frequency is tunable
*/
enum scrub_type {
SCRUB_UNKNOWN = 0,
@@ -326,114 +336,6 @@ enum scrub_type {
#define OP_RUNNING_POLL_INTR 0x203
#define OP_OFFLINE 0x300
-/*
- * Concepts used at the EDAC subsystem
- *
- * There are several things to be aware of that aren't at all obvious:
- *
- * SOCKETS, SOCKET SETS, BANKS, ROWS, CHIP-SELECT ROWS, CHANNELS, etc..
- *
- * These are some of the many terms that are thrown about that don't always
- * mean what people think they mean (Inconceivable!). In the interest of
- * creating a common ground for discussion, terms and their definitions
- * will be established.
- *
- * Memory devices: The individual DRAM chips on a memory stick. These
- * devices commonly output 4 and 8 bits each (x4, x8).
- * Grouping several of these in parallel provides the
- * number of bits that the memory controller expects:
- * typically 72 bits, in order to provide 64 bits +
- * 8 bits of ECC data.
- *
- * Memory Stick: A printed circuit board that aggregates multiple
- * memory devices in parallel. In general, this is the
- * Field Replaceable Unit (FRU) which gets replaced, in
- * the case of excessive errors. Most often it is also
- * called DIMM (Dual Inline Memory Module).
- *
- * Memory Socket: A physical connector on the motherboard that accepts
- * a single memory stick. Also called as "slot" on several
- * datasheets.
- *
- * Channel: A memory controller channel, responsible to communicate
- * with a group of DIMMs. Each channel has its own
- * independent control (command) and data bus, and can
- * be used independently or grouped with other channels.
- *
- * Branch: It is typically the highest hierarchy on a
- * Fully-Buffered DIMM memory controller.
- * Typically, it contains two channels.
- * Two channels at the same branch can be used in single
- * mode or in lockstep mode.
- * When lockstep is enabled, the cacheline is doubled,
- * but it generally brings some performance penalty.
- * Also, it is generally not possible to point to just one
- * memory stick when an error occurs, as the error
- * correction code is calculated using two DIMMs instead
- * of one. Due to that, it is capable of correcting more
- * errors than on single mode.
- *
- * Single-channel: The data accessed by the memory controller is contained
- * into one dimm only. E. g. if the data is 64 bits-wide,
- * the data flows to the CPU using one 64 bits parallel
- * access.
- * Typically used with SDR, DDR, DDR2 and DDR3 memories.
- * FB-DIMM and RAMBUS use a different concept for channel,
- * so this concept doesn't apply there.
- *
- * Double-channel: The data size accessed by the memory controller is
- * interlaced into two dimms, accessed at the same time.
- * E. g. if the DIMM is 64 bits-wide (72 bits with ECC),
- * the data flows to the CPU using a 128 bits parallel
- * access.
- *
- * Chip-select row: This is the name of the DRAM signal used to select the
- * DRAM ranks to be accessed. Common chip-select rows for
- * single channel are 64 bits, for dual channel 128 bits.
- * It may not be visible by the memory controller, as some
- * DIMM types have a memory buffer that can hide direct
- * access to it from the Memory Controller.
- *
- * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memory.
- * Motherboards commonly drive two chip-select pins to
- * a memory stick. A single-ranked stick, will occupy
- * only one of those rows. The other will be unused.
- *
- * Double-Ranked stick: A double-ranked stick has two chip-select rows which
- * access different sets of memory devices. The two
- * rows cannot be accessed concurrently.
- *
- * Double-sided stick: DEPRECATED TERM, see Double-Ranked stick.
- * A double-sided stick has two chip-select rows which
- * access different sets of memory devices. The two
- * rows cannot be accessed concurrently. "Double-sided"
- * is irrespective of the memory devices being mounted
- * on both sides of the memory stick.
- *
- * Socket set: All of the memory sticks that are required for
- * a single memory access or all of the memory sticks
- * spanned by a chip-select row. A single socket set
- * has two chip-select rows and if double-sided sticks
- * are used these will occupy those chip-select rows.
- *
- * Bank: This term is avoided because it is unclear when
- * needing to distinguish between chip-select rows and
- * socket sets.
- *
- * Controller pages:
- *
- * Physical pages:
- *
- * Virtual pages:
- *
- *
- * STRUCTURE ORGANIZATION AND CHOICES
- *
- *
- *
- * PS - I enjoyed writing all that about as much as you enjoyed reading it.
- */
-
/**
* enum edac_mc_layer - memory controller hierarchy layer
*
@@ -458,7 +360,7 @@ enum edac_mc_layer_type {
/**
* struct edac_mc_layer - describes the memory controller hierarchy
- * @layer: layer type
+ * @type: layer type
* @size: number of components per layer. For example,
* if the channel layer has two channels, size = 2
* @is_virt_csrow: This layer is part of the "csrow" when old API
@@ -481,24 +383,28 @@ struct edac_mc_layer {
#define EDAC_MAX_LAYERS 3
/**
- * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer array
- * for the element given by [layer0,layer1,layer2] position
+ * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer
+ * array for the element given by [layer0,layer1,layer2]
+ * position
*
* @layers: a struct edac_mc_layer array, describing how many elements
* were allocated for each layer
- * @n_layers: Number of layers at the @layers array
+ * @nlayers: Number of layers at the @layers array
* @layer0: layer0 position
* @layer1: layer1 position. Unused if n_layers < 2
* @layer2: layer2 position. Unused if n_layers < 3
*
- * For 1 layer, this macro returns &var[layer0] - &var
+ * For 1 layer, this macro returns "var[layer0] - var";
+ *
* For 2 layers, this macro is similar to allocate a bi-dimensional array
- * and to return "&var[layer0][layer1] - &var"
+ * and to return "var[layer0][layer1] - var";
+ *
* For 3 layers, this macro is similar to allocate a tri-dimensional array
- * and to return "&var[layer0][layer1][layer2] - &var"
+ * and to return "var[layer0][layer1][layer2] - var".
*
* A loop could be used here to make it more generic, but, as we only have
* 3 layers, this is a little faster.
+ *
* By design, layers can never be 0 or more than 3. If that ever happens,
* a NULL is returned, causing an OOPS during the memory allocation routine,
* with would point to the developer that he's doing something wrong.
@@ -525,16 +431,18 @@ struct edac_mc_layer {
* were allocated for each layer
* @var: name of the var where we want to get the pointer
* (like mci->dimms)
- * @n_layers: Number of layers at the @layers array
+ * @nlayers: Number of layers at the @layers array
* @layer0: layer0 position
* @layer1: layer1 position. Unused if n_layers < 2
* @layer2: layer2 position. Unused if n_layers < 3
*
- * For 1 layer, this macro returns &var[layer0]
+ * For 1 layer, this macro returns "var[layer0]";
+ *
* For 2 layers, this macro is similar to allocate a bi-dimensional array
- * and to return "&var[layer0][layer1]"
+ * and to return "var[layer0][layer1]";
+ *
* For 3 layers, this macro is similar to allocate a tri-dimensional array
- * and to return "&var[layer0][layer1][layer2]"
+ * and to return "var[layer0][layer1][layer2]";
*/
#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
typeof(*var) __p; \
@@ -620,7 +528,7 @@ struct errcount_attribute_data {
};
/**
- * edac_raw_error_desc - Raw error report structure
+ * struct edac_raw_error_desc - Raw error report structure
* @grain: minimum granularity for an error report, in bytes
* @error_count: number of errors of the same type
* @top_layer: top layer of the error (layer[0])