sb_edac: allow different interleave lists
drivers/edac/sb_edac.c
1 /* Intel Sandy Bridge -EN/-EP/-EX Memory Controller kernel module
2 *
3 * This driver supports the memory controllers found on the Intel
4 * processor family Sandy Bridge.
5 *
6 * This file may be distributed under the terms of the
7 * GNU General Public License version 2 only.
8 *
9 * Copyright (c) 2011 by:
10 * Mauro Carvalho Chehab <mchehab@redhat.com>
11 */
12
13 #include <linux/module.h>
14 #include <linux/init.h>
15 #include <linux/pci.h>
16 #include <linux/pci_ids.h>
17 #include <linux/slab.h>
18 #include <linux/delay.h>
19 #include <linux/edac.h>
20 #include <linux/mmzone.h>
21 #include <linux/smp.h>
22 #include <linux/bitmap.h>
23 #include <linux/math64.h>
24 #include <asm/processor.h>
25 #include <asm/mce.h>
26
27 #include "edac_core.h"
28
29 /* Static vars */
30 static LIST_HEAD(sbridge_edac_list);
31 static DEFINE_MUTEX(sbridge_edac_lock);
32 static int probed;
33
34 /*
35 * Alter this version for the module when modifications are made
36 */
37 #define SBRIDGE_REVISION " Ver: 1.0.0 "
38 #define EDAC_MOD_STR "sbridge_edac"
39
40 /*
41 * Debug macros
42 */
43 #define sbridge_printk(level, fmt, arg...) \
44 edac_printk(level, "sbridge", fmt, ##arg)
45
46 #define sbridge_mc_printk(mci, level, fmt, arg...) \
47 edac_mc_chipset_printk(mci, level, "sbridge", fmt, ##arg)
48
49 /*
50 * Get a bit field at register value <v>, from bit <lo> to bit <hi>
51 */
52 #define GET_BITFIELD(v, lo, hi) \
53 (((v) & ((1ULL << ((hi) - (lo) + 1)) - 1) << (lo)) >> (lo))
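/*
 * For example, GET_BITFIELD(0x1234, 4, 7) builds the mask
 * ((1ULL << 4) - 1) << 4 = 0xf0, ANDs it with the value (giving 0x30)
 * and shifts right by 4, yielding 0x3, i.e. bits 7:4 of 0x1234.
 */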
54
55 /*
56 * sbridge Memory Controller Registers
57 */
58
59 /*
60 * FIXME: For now, let's order by device function, as it makes
61 * the driver's development process easier. This table should be
62 * moved to pci_ids.h when submitted upstream.
63 */
64 #define PCI_DEVICE_ID_INTEL_SBRIDGE_SAD0 0x3cf4 /* 12.6 */
65 #define PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1 0x3cf6 /* 12.7 */
66 #define PCI_DEVICE_ID_INTEL_SBRIDGE_BR 0x3cf5 /* 13.6 */
67 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0 0x3ca0 /* 14.0 */
68 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA 0x3ca8 /* 15.0 */
69 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_RAS 0x3c71 /* 15.1 */
70 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD0 0x3caa /* 15.2 */
71 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD1 0x3cab /* 15.3 */
72 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD2 0x3cac /* 15.4 */
73 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD3 0x3cad /* 15.5 */
74 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_DDRIO 0x3cb8 /* 17.0 */
75
76 /*
77 * Currently unused, but will be needed in future
78 * implementations, as they hold the error counters
79 */
80 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_ERR0 0x3c72 /* 16.2 */
81 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_ERR1 0x3c73 /* 16.3 */
82 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_ERR2 0x3c76 /* 16.6 */
83 #define PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_ERR3 0x3c77 /* 16.7 */
84
85 /* Device 12 Function 6, Offsets 0x80 to 0xcc */
86 static const u32 sbridge_dram_rule[] = {
87 0x80, 0x88, 0x90, 0x98, 0xa0,
88 0xa8, 0xb0, 0xb8, 0xc0, 0xc8,
89 };
90
91 #define SAD_LIMIT(reg) ((GET_BITFIELD(reg, 6, 25) << 26) | 0x3ffffff)
92 #define DRAM_ATTR(reg) GET_BITFIELD(reg, 2, 3)
93 #define INTERLEAVE_MODE(reg) GET_BITFIELD(reg, 1, 1)
94 #define DRAM_RULE_ENABLE(reg) GET_BITFIELD(reg, 0, 0)
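
/*
 * For example, a dram rule whose bits 25:6 read 0x2 decodes to
 * SAD_LIMIT = (0x2 << 26) | 0x3ffffff = 0xbffffff: limits are kept in
 * 64 MB granularity and returned with the low 26 bits filled with ones,
 * so this rule covers addresses up to just below 192 MB.
 */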
95
96 static char *get_dram_attr(u32 reg)
97 {
98 switch (DRAM_ATTR(reg)) {
99 case 0:
100 return "DRAM";
101 case 1:
102 return "MMCFG";
103 case 2:
104 return "NXM";
105 default:
106 return "unknown";
107 }
108 }
109
110 static const u32 sbridge_interleave_list[] = {
111 0x84, 0x8c, 0x94, 0x9c, 0xa4,
112 0xac, 0xb4, 0xbc, 0xc4, 0xcc,
113 };
114
115 #define SAD_PKG0(reg) GET_BITFIELD(reg, 0, 2)
116 #define SAD_PKG1(reg) GET_BITFIELD(reg, 3, 5)
117 #define SAD_PKG2(reg) GET_BITFIELD(reg, 8, 10)
118 #define SAD_PKG3(reg) GET_BITFIELD(reg, 11, 13)
119 #define SAD_PKG4(reg) GET_BITFIELD(reg, 16, 18)
120 #define SAD_PKG5(reg) GET_BITFIELD(reg, 19, 21)
121 #define SAD_PKG6(reg) GET_BITFIELD(reg, 24, 26)
122 #define SAD_PKG7(reg) GET_BITFIELD(reg, 27, 29)
123
124 static inline int sad_pkg(u32 reg, int interleave)
125 {
126 switch (interleave) {
127 case 0:
128 return SAD_PKG0(reg);
129 case 1:
130 return SAD_PKG1(reg);
131 case 2:
132 return SAD_PKG2(reg);
133 case 3:
134 return SAD_PKG3(reg);
135 case 4:
136 return SAD_PKG4(reg);
137 case 5:
138 return SAD_PKG5(reg);
139 case 6:
140 return SAD_PKG6(reg);
141 case 7:
142 return SAD_PKG7(reg);
143 default:
144 return -EINVAL;
145 }
146 }
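
/*
 * Note that the eight 3-bit package fields are not contiguous: after
 * every two fields the register skips two bits (0-2, 3-5, then 8-10,
 * 11-13, ...), which is why sad_pkg() cannot simply shift by
 * 3 * interleave and must go through the SAD_PKGn() accessors.
 */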
147
148 /* Device 12 Function 7 */
149
150 #define TOLM 0x80
151 #define TOHM 0x84
152
153 #define GET_TOLM(reg) ((GET_BITFIELD(reg, 0, 3) << 28) | 0x3ffffff)
154 #define GET_TOHM(reg) ((GET_BITFIELD(reg, 0, 20) << 25) | 0x3ffffff)
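
/*
 * TOLM is kept in 256 MB granularity: for example, a raw value of 0x4
 * yields GET_TOLM = (0x4 << 28) | 0x3ffffff = 0x43ffffff. Like the SAD
 * limits, both values are returned with their low bits filled with ones.
 */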
155
156 /* Device 13 Function 6 */
157
158 #define SAD_TARGET 0xf0
159
160 #define SOURCE_ID(reg) GET_BITFIELD(reg, 9, 11)
161
162 #define SAD_CONTROL 0xf4
163
164 #define NODE_ID(reg) GET_BITFIELD(reg, 0, 2)
165
166 /* Device 14 function 0 */
167
168 static const u32 tad_dram_rule[] = {
169 0x40, 0x44, 0x48, 0x4c,
170 0x50, 0x54, 0x58, 0x5c,
171 0x60, 0x64, 0x68, 0x6c,
172 };
173 #define MAX_TAD ARRAY_SIZE(tad_dram_rule)
174
175 #define TAD_LIMIT(reg) ((GET_BITFIELD(reg, 12, 31) << 26) | 0x3ffffff)
176 #define TAD_SOCK(reg) GET_BITFIELD(reg, 10, 11)
177 #define TAD_CH(reg) GET_BITFIELD(reg, 8, 9)
178 #define TAD_TGT3(reg) GET_BITFIELD(reg, 6, 7)
179 #define TAD_TGT2(reg) GET_BITFIELD(reg, 4, 5)
180 #define TAD_TGT1(reg) GET_BITFIELD(reg, 2, 3)
181 #define TAD_TGT0(reg) GET_BITFIELD(reg, 0, 1)
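
/*
 * As used by get_memory_error_data(), TAD_SOCK and TAD_CH both encode
 * wayness minus one: for example, a TAD rule with TAD_SOCK = 1 and
 * TAD_CH = 1 describes a 2-way socket interleave combined with a 2-way
 * channel interleave, and TAD_TGT0..TAD_TGT3 list the channel chosen
 * for each interleave index.
 */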
182
183 /* Device 15, function 0 */
184
185 #define MCMTR 0x7c
186
187 #define IS_ECC_ENABLED(mcmtr) GET_BITFIELD(mcmtr, 2, 2)
188 #define IS_LOCKSTEP_ENABLED(mcmtr) GET_BITFIELD(mcmtr, 1, 1)
189 #define IS_CLOSE_PG(mcmtr) GET_BITFIELD(mcmtr, 0, 0)
190
191 /* Device 15, function 1 */
192
193 #define RASENABLES 0xac
194 #define IS_MIRROR_ENABLED(reg) GET_BITFIELD(reg, 0, 0)
195
196 /* Device 15, functions 2-5 */
197
198 static const int mtr_regs[] = {
199 0x80, 0x84, 0x88,
200 };
201
202 #define RANK_DISABLE(mtr) GET_BITFIELD(mtr, 16, 19)
203 #define IS_DIMM_PRESENT(mtr) GET_BITFIELD(mtr, 14, 14)
204 #define RANK_CNT_BITS(mtr) GET_BITFIELD(mtr, 12, 13)
205 #define RANK_WIDTH_BITS(mtr) GET_BITFIELD(mtr, 2, 4)
206 #define COL_WIDTH_BITS(mtr) GET_BITFIELD(mtr, 0, 1)
207
208 static const u32 tad_ch_nilv_offset[] = {
209 0x90, 0x94, 0x98, 0x9c,
210 0xa0, 0xa4, 0xa8, 0xac,
211 0xb0, 0xb4, 0xb8, 0xbc,
212 };
213 #define CHN_IDX_OFFSET(reg) GET_BITFIELD(reg, 28, 29)
214 #define TAD_OFFSET(reg) (GET_BITFIELD(reg, 6, 25) << 26)
215
216 static const u32 rir_way_limit[] = {
217 0x108, 0x10c, 0x110, 0x114, 0x118,
218 };
219 #define MAX_RIR_RANGES ARRAY_SIZE(rir_way_limit)
220
221 #define IS_RIR_VALID(reg) GET_BITFIELD(reg, 31, 31)
222 #define RIR_WAY(reg) GET_BITFIELD(reg, 28, 29)
223 #define RIR_LIMIT(reg) ((GET_BITFIELD(reg, 1, 10) << 29) | 0x1fffffff)
224
225 #define MAX_RIR_WAY 8
226
227 static const u32 rir_offset[MAX_RIR_RANGES][MAX_RIR_WAY] = {
228 { 0x120, 0x124, 0x128, 0x12c, 0x130, 0x134, 0x138, 0x13c },
229 { 0x140, 0x144, 0x148, 0x14c, 0x150, 0x154, 0x158, 0x15c },
230 { 0x160, 0x164, 0x168, 0x16c, 0x170, 0x174, 0x178, 0x17c },
231 { 0x180, 0x184, 0x188, 0x18c, 0x190, 0x194, 0x198, 0x19c },
232 { 0x1a0, 0x1a4, 0x1a8, 0x1ac, 0x1b0, 0x1b4, 0x1b8, 0x1bc },
233 };
234
235 #define RIR_RNK_TGT(reg) GET_BITFIELD(reg, 16, 19)
236 #define RIR_OFFSET(reg) GET_BITFIELD(reg, 2, 14)
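
/*
 * RIR_LIMIT is kept in 512 MB granularity (bits 10:1 shifted up to
 * address bits 38:29, low 29 bits filled with ones), while
 * get_memory_layout() treats RIR_OFFSET as a count of 64 MB units,
 * printing RIR_OFFSET(reg) << 6 megabytes.
 */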
237
238 /* Device 16, functions 2-7 */
239
240 /*
241 * FIXME: Implement the error count reads directly
242 */
243
244 static const u32 correrrcnt[] = {
245 0x104, 0x108, 0x10c, 0x110,
246 };
247
248 #define RANK_ODD_OV(reg) GET_BITFIELD(reg, 31, 31)
249 #define RANK_ODD_ERR_CNT(reg) GET_BITFIELD(reg, 16, 30)
250 #define RANK_EVEN_OV(reg) GET_BITFIELD(reg, 15, 15)
251 #define RANK_EVEN_ERR_CNT(reg) GET_BITFIELD(reg, 0, 14)
252
253 static const u32 correrrthrsld[] = {
254 0x11c, 0x120, 0x124, 0x128,
255 };
256
257 #define RANK_ODD_ERR_THRSLD(reg) GET_BITFIELD(reg, 16, 30)
258 #define RANK_EVEN_ERR_THRSLD(reg) GET_BITFIELD(reg, 0, 14)
259
260
261 /* Device 17, function 0 */
262
263 #define SB_RANK_CFG_A 0x0328
264
265 #define IS_RDIMM_ENABLED(reg) GET_BITFIELD(reg, 11, 11)
266
267 /*
268 * sbridge structs
269 */
270
271 #define NUM_CHANNELS 4
272 #define MAX_DIMMS 3 /* Max DIMMs per channel */
273
274 struct sbridge_pvt;
275 struct sbridge_info {
276 u32 mcmtr;
277 u32 rankcfgr;
278 u64 (*get_tolm)(struct sbridge_pvt *pvt);
279 u64 (*get_tohm)(struct sbridge_pvt *pvt);
280 const u32 *dram_rule;
281 const u32 *interleave_list;
282 u8 max_sad;
283 u8 max_interleave;
284 };
285
286 struct sbridge_channel {
287 u32 ranks;
288 u32 dimms;
289 };
290
291 struct pci_id_descr {
292 int dev;
293 int func;
294 int dev_id;
295 int optional;
296 };
297
298 struct pci_id_table {
299 const struct pci_id_descr *descr;
300 int n_devs;
301 };
302
303 struct sbridge_dev {
304 struct list_head list;
305 u8 bus, mc;
306 u8 node_id, source_id;
307 struct pci_dev **pdev;
308 int n_devs;
309 struct mem_ctl_info *mci;
310 };
311
312 struct sbridge_pvt {
313 struct pci_dev *pci_ta, *pci_ddrio, *pci_ras;
314 struct pci_dev *pci_sad0, *pci_sad1, *pci_ha0;
315 struct pci_dev *pci_br0;
316 struct pci_dev *pci_tad[NUM_CHANNELS];
317
318 struct sbridge_dev *sbridge_dev;
319
320 struct sbridge_info info;
321 struct sbridge_channel channel[NUM_CHANNELS];
322
323 /* Memory type detection */
324 bool is_mirrored, is_lockstep, is_close_pg;
325
326 /* FIFO double buffers */
327 struct mce mce_entry[MCE_LOG_LEN];
328 struct mce mce_outentry[MCE_LOG_LEN];
329
330 /* FIFO in/out counters */
331 unsigned mce_in, mce_out;
332
333 /* Count of errors not retrieved because the ring buffer overran */
334 unsigned mce_overrun;
335
336 /* Memory description */
337 u64 tolm, tohm;
338 };
339
340 #define PCI_DESCR(device, function, device_id, opt) \
341 .dev = (device), \
342 .func = (function), \
343 .dev_id = (device_id), \
344 .optional = opt
345
346 static const struct pci_id_descr pci_dev_descr_sbridge[] = {
347 /* Processor Home Agent */
348 { PCI_DESCR(14, 0, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0, 0) },
349
350 /* Memory controller */
351 { PCI_DESCR(15, 0, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA, 0) },
352 { PCI_DESCR(15, 1, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_RAS, 0) },
353 { PCI_DESCR(15, 2, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD0, 0) },
354 { PCI_DESCR(15, 3, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD1, 0) },
355 { PCI_DESCR(15, 4, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD2, 0) },
356 { PCI_DESCR(15, 5, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD3, 0) },
357 { PCI_DESCR(17, 0, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_DDRIO, 1) },
358
359 /* System Address Decoder */
360 { PCI_DESCR(12, 6, PCI_DEVICE_ID_INTEL_SBRIDGE_SAD0, 0) },
361 { PCI_DESCR(12, 7, PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1, 0) },
362
363 /* Broadcast Registers */
364 { PCI_DESCR(13, 6, PCI_DEVICE_ID_INTEL_SBRIDGE_BR, 0) },
365 };
366
367 #define PCI_ID_TABLE_ENTRY(A) { .descr = A, .n_devs = ARRAY_SIZE(A) }
368 static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
369 PCI_ID_TABLE_ENTRY(pci_dev_descr_sbridge),
370 {0,} /* 0 terminated list. */
371 };
372
373 /*
374 * pci_device_id table for which devices we are looking for
375 */
376 static DEFINE_PCI_DEVICE_TABLE(sbridge_pci_tbl) = {
377 {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA)},
378 {0,} /* 0 terminated list. */
379 };
380
381
382 /****************************************************************************
383 Ancillary status routines
384 ****************************************************************************/
385
386 static inline int numrank(u32 mtr)
387 {
388 int ranks = (1 << RANK_CNT_BITS(mtr));
389
390 if (ranks > 4) {
391 edac_dbg(0, "Invalid number of ranks: %d (max = 4) raw value = %x (%04x)\n",
392 ranks, (unsigned int)RANK_CNT_BITS(mtr), mtr);
393 return -EINVAL;
394 }
395
396 return ranks;
397 }
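
/*
 * RANK_CNT_BITS encodes log2(ranks): a raw value of 1 means a dual-rank
 * DIMM, and any raw value above 2 (i.e. more than 4 ranks) is rejected.
 */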
398
399 static inline int numrow(u32 mtr)
400 {
401 int rows = (RANK_WIDTH_BITS(mtr) + 12);
402
403 if (rows < 13 || rows > 18) {
404 edac_dbg(0, "Invalid number of rows: %d (should be between 14 and 17) raw value = %x (%04x)\n",
405 rows, (unsigned int)RANK_WIDTH_BITS(mtr), mtr);
406 return -EINVAL;
407 }
408
409 return 1 << rows;
410 }
411
412 static inline int numcol(u32 mtr)
413 {
414 int cols = (COL_WIDTH_BITS(mtr) + 10);
415
416 if (cols > 12) {
417 edac_dbg(0, "Invalid number of cols: %d (max = 4) raw value = %x (%04x)\n",
418 cols, (unsigned int)COL_WIDTH_BITS(mtr), mtr);
419 return -EINVAL;
420 }
421
422 return 1 << cols;
423 }
424
425 static struct sbridge_dev *get_sbridge_dev(u8 bus)
426 {
427 struct sbridge_dev *sbridge_dev;
428
429 list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) {
430 if (sbridge_dev->bus == bus)
431 return sbridge_dev;
432 }
433
434 return NULL;
435 }
436
437 static struct sbridge_dev *alloc_sbridge_dev(u8 bus,
438 const struct pci_id_table *table)
439 {
440 struct sbridge_dev *sbridge_dev;
441
442 sbridge_dev = kzalloc(sizeof(*sbridge_dev), GFP_KERNEL);
443 if (!sbridge_dev)
444 return NULL;
445
446 sbridge_dev->pdev = kzalloc(sizeof(*sbridge_dev->pdev) * table->n_devs,
447 GFP_KERNEL);
448 if (!sbridge_dev->pdev) {
449 kfree(sbridge_dev);
450 return NULL;
451 }
452
453 sbridge_dev->bus = bus;
454 sbridge_dev->n_devs = table->n_devs;
455 list_add_tail(&sbridge_dev->list, &sbridge_edac_list);
456
457 return sbridge_dev;
458 }
459
460 static void free_sbridge_dev(struct sbridge_dev *sbridge_dev)
461 {
462 list_del(&sbridge_dev->list);
463 kfree(sbridge_dev->pdev);
464 kfree(sbridge_dev);
465 }
466
467 static u64 sbridge_get_tolm(struct sbridge_pvt *pvt)
468 {
469 u32 reg;
470
471 /* Address range is 32:28 */
472 pci_read_config_dword(pvt->pci_sad1, TOLM, &reg);
473 return GET_TOLM(reg);
474 }
475
476 static u64 sbridge_get_tohm(struct sbridge_pvt *pvt)
477 {
478 u32 reg;
479
480 pci_read_config_dword(pvt->pci_sad1, TOHM, &reg);
481 return GET_TOHM(reg);
482 }
483
484 /****************************************************************************
485 Memory check routines
486 ****************************************************************************/
487 static struct pci_dev *get_pdev_slot_func(u8 bus, unsigned slot,
488 unsigned func)
489 {
490 struct sbridge_dev *sbridge_dev = get_sbridge_dev(bus);
491 int i;
492
493 if (!sbridge_dev)
494 return NULL;
495
496 for (i = 0; i < sbridge_dev->n_devs; i++) {
497 if (!sbridge_dev->pdev[i])
498 continue;
499
500 if (PCI_SLOT(sbridge_dev->pdev[i]->devfn) == slot &&
501 PCI_FUNC(sbridge_dev->pdev[i]->devfn) == func) {
502 edac_dbg(1, "Associated %02x.%02x.%d with %p\n",
503 bus, slot, func, sbridge_dev->pdev[i]);
504 return sbridge_dev->pdev[i];
505 }
506 }
507
508 return NULL;
509 }
510
511 /**
512 * check_if_ecc_is_active() - Checks if ECC is active
513 * @bus: Device bus
514 */
515 static int check_if_ecc_is_active(const u8 bus)
516 {
517 struct pci_dev *pdev = NULL;
518 u32 mcmtr;
519
520 pdev = get_pdev_slot_func(bus, 15, 0);
521 if (!pdev) {
522 sbridge_printk(KERN_ERR, "Couldn't find PCI device "
523 "%2x.%02d.%d!!!\n",
524 bus, 15, 0);
525 return -ENODEV;
526 }
527
528 pci_read_config_dword(pdev, MCMTR, &mcmtr);
529 if (!IS_ECC_ENABLED(mcmtr)) {
530 sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n");
531 return -ENODEV;
532 }
533 return 0;
534 }
535
536 static int get_dimm_config(struct mem_ctl_info *mci)
537 {
538 struct sbridge_pvt *pvt = mci->pvt_info;
539 struct dimm_info *dimm;
540 unsigned i, j, banks, ranks, rows, cols, npages;
541 u64 size;
542 u32 reg;
543 enum edac_type mode;
544 enum mem_type mtype;
545
546 pvt->info.rankcfgr = SB_RANK_CFG_A;
547
548 pci_read_config_dword(pvt->pci_br0, SAD_TARGET, &reg);
549 pvt->sbridge_dev->source_id = SOURCE_ID(reg);
550
551 pci_read_config_dword(pvt->pci_br0, SAD_CONTROL, &reg);
552 pvt->sbridge_dev->node_id = NODE_ID(reg);
553 edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n",
554 pvt->sbridge_dev->mc,
555 pvt->sbridge_dev->node_id,
556 pvt->sbridge_dev->source_id);
557
558 pci_read_config_dword(pvt->pci_ras, RASENABLES, &reg);
559 if (IS_MIRROR_ENABLED(reg)) {
560 edac_dbg(0, "Memory mirror is enabled\n");
561 pvt->is_mirrored = true;
562 } else {
563 edac_dbg(0, "Memory mirror is disabled\n");
564 pvt->is_mirrored = false;
565 }
566
567 pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr);
568 if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) {
569 edac_dbg(0, "Lockstep is enabled\n");
570 mode = EDAC_S8ECD8ED;
571 pvt->is_lockstep = true;
572 } else {
573 edac_dbg(0, "Lockstep is disabled\n");
574 mode = EDAC_S4ECD4ED;
575 pvt->is_lockstep = false;
576 }
577 if (IS_CLOSE_PG(pvt->info.mcmtr)) {
578 edac_dbg(0, "address map is on closed page mode\n");
579 pvt->is_close_pg = true;
580 } else {
581 edac_dbg(0, "address map is on open page mode\n");
582 pvt->is_close_pg = false;
583 }
584
585 if (pvt->pci_ddrio) {
586 pci_read_config_dword(pvt->pci_ddrio, pvt->info.rankcfgr,
587 &reg);
588 if (IS_RDIMM_ENABLED(reg)) {
589 /* FIXME: Can also be LRDIMM */
590 edac_dbg(0, "Memory is registered\n");
591 mtype = MEM_RDDR3;
592 } else {
593 edac_dbg(0, "Memory is unregistered\n");
594 mtype = MEM_DDR3;
595 }
596 } else {
597 edac_dbg(0, "Cannot determine memory type\n");
598 mtype = MEM_UNKNOWN;
599 }
600
601 /* On all supported DDR3 DIMM types, there are 8 banks available */
602 banks = 8;
603
604 for (i = 0; i < NUM_CHANNELS; i++) {
605 u32 mtr;
606
607 for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
608 dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
609 i, j, 0);
610 pci_read_config_dword(pvt->pci_tad[i],
611 mtr_regs[j], &mtr);
612 edac_dbg(4, "Channel #%d MTR%d = %x\n", i, j, mtr);
613 if (IS_DIMM_PRESENT(mtr)) {
614 pvt->channel[i].dimms++;
615
616 ranks = numrank(mtr);
617 rows = numrow(mtr);
618 cols = numcol(mtr);
619
620 /* DDR3 has 8 I/O banks */
621 size = ((u64)rows * cols * banks * ranks) >> (20 - 3);
622 npages = MiB_TO_PAGES(size);
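
/*
 * For example, a dual-rank DIMM with 2^15 rows, 2^10 columns and
 * 8 banks gives (2^15 * 2^10 * 8 * 2) >> 17 = 4096 MB: the shift by
 * (20 - 3) converts cells to megabytes while multiplying by the
 * 8-byte width of the DDR3 data bus.
 */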
623
624 edac_dbg(0, "mc#%d: channel %d, dimm %d, %Ld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
625 pvt->sbridge_dev->mc, i, j,
626 size, npages,
627 banks, ranks, rows, cols);
628
629 dimm->nr_pages = npages;
630 dimm->grain = 32;
631 dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
632 dimm->mtype = mtype;
633 dimm->edac_mode = mode;
634 snprintf(dimm->label, sizeof(dimm->label),
635 "CPU_SrcID#%u_Channel#%u_DIMM#%u",
636 pvt->sbridge_dev->source_id, i, j);
637 }
638 }
639 }
640
641 return 0;
642 }
643
644 static void get_memory_layout(const struct mem_ctl_info *mci)
645 {
646 struct sbridge_pvt *pvt = mci->pvt_info;
647 int i, j, k, n_sads, n_tads, sad_interl;
648 u32 reg;
649 u64 limit, prv = 0;
650 u64 tmp_mb;
651 u32 mb, kb;
652 u32 rir_way;
653
654 /*
655 * Step 1) Get TOLM/TOHM ranges
656 */
657
658 pvt->tolm = pvt->info.get_tolm(pvt);
659 tmp_mb = (1 + pvt->tolm) >> 20;
660
661 mb = div_u64_rem(tmp_mb, 1000, &kb);
662 edac_dbg(0, "TOLM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tolm);
663
664 /* Address range is already 45:25 */
665 pvt->tohm = pvt->info.get_tohm(pvt);
666 tmp_mb = (1 + pvt->tohm) >> 20;
667
668 mb = div_u64_rem(tmp_mb, 1000, &kb);
669 edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tohm);
670
671 /*
672 * Step 2) Get SAD range and SAD Interleave list
673 * TAD registers contain the interleave wayness. However, it
674 * seems simpler to just discover it indirectly, with the
675 * algorithm below.
676 */
677 prv = 0;
678 for (n_sads = 0; n_sads < pvt->info.max_sad; n_sads++) {
679 /* SAD_LIMIT Address range is 45:26 */
680 pci_read_config_dword(pvt->pci_sad0, pvt->info.dram_rule[n_sads],
681 &reg);
682 limit = SAD_LIMIT(reg);
683
684 if (!DRAM_RULE_ENABLE(reg))
685 continue;
686
687 if (limit <= prv)
688 break;
689
690 tmp_mb = (limit + 1) >> 20;
691 mb = div_u64_rem(tmp_mb, 1000, &kb);
692 edac_dbg(0, "SAD#%d %s up to %u.%03u GB (0x%016Lx) Interleave: %s reg=0x%08x\n",
693 n_sads,
694 get_dram_attr(reg),
695 mb, kb,
696 ((u64)tmp_mb) << 20L,
697 INTERLEAVE_MODE(reg) ? "8:6" : "[8:6]XOR[18:16]",
698 reg);
699 prv = limit;
700
701 pci_read_config_dword(pvt->pci_sad0, pvt->info.interleave_list[n_sads],
702 &reg);
703 sad_interl = sad_pkg(reg, 0);
704 for (j = 0; j < 8; j++) {
705 if (j > 0 && sad_interl == sad_pkg(reg, j))
706 break;
707
708 edac_dbg(0, "SAD#%d, interleave #%d: %d\n",
709 n_sads, j, sad_pkg(reg, j));
710 }
711 }
712
713 /*
714 * Step 3) Get TAD range
715 */
716 prv = 0;
717 for (n_tads = 0; n_tads < MAX_TAD; n_tads++) {
718 pci_read_config_dword(pvt->pci_ha0, tad_dram_rule[n_tads],
719 &reg);
720 limit = TAD_LIMIT(reg);
721 if (limit <= prv)
722 break;
723 tmp_mb = (limit + 1) >> 20;
724
725 mb = div_u64_rem(tmp_mb, 1000, &kb);
726 edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n",
727 n_tads, mb, kb,
728 ((u64)tmp_mb) << 20L,
729 (u32)TAD_SOCK(reg),
730 (u32)TAD_CH(reg),
731 (u32)TAD_TGT0(reg),
732 (u32)TAD_TGT1(reg),
733 (u32)TAD_TGT2(reg),
734 (u32)TAD_TGT3(reg),
735 reg);
736 prv = limit;
737 }
738
739 /*
740 * Step 4) Get TAD offsets, for each channel
741 */
742 for (i = 0; i < NUM_CHANNELS; i++) {
743 if (!pvt->channel[i].dimms)
744 continue;
745 for (j = 0; j < n_tads; j++) {
746 pci_read_config_dword(pvt->pci_tad[i],
747 tad_ch_nilv_offset[j],
748 &reg);
749 tmp_mb = TAD_OFFSET(reg) >> 20;
750 mb = div_u64_rem(tmp_mb, 1000, &kb);
751 edac_dbg(0, "TAD CH#%d, offset #%d: %u.%03u GB (0x%016Lx), reg=0x%08x\n",
752 i, j,
753 mb, kb,
754 ((u64)tmp_mb) << 20L,
755 reg);
756 }
757 }
758
759 /*
760 * Step 5) Get RIR Wayness/Limit, for each channel
761 */
762 for (i = 0; i < NUM_CHANNELS; i++) {
763 if (!pvt->channel[i].dimms)
764 continue;
765 for (j = 0; j < MAX_RIR_RANGES; j++) {
766 pci_read_config_dword(pvt->pci_tad[i],
767 rir_way_limit[j],
768 &reg);
769
770 if (!IS_RIR_VALID(reg))
771 continue;
772
773 tmp_mb = RIR_LIMIT(reg) >> 20;
774 rir_way = 1 << RIR_WAY(reg);
775 mb = div_u64_rem(tmp_mb, 1000, &kb);
776 edac_dbg(0, "CH#%d RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d, reg=0x%08x\n",
777 i, j,
778 mb, kb,
779 ((u64)tmp_mb) << 20L,
780 rir_way,
781 reg);
782
783 for (k = 0; k < rir_way; k++) {
784 pci_read_config_dword(pvt->pci_tad[i],
785 rir_offset[j][k],
786 &reg);
787 tmp_mb = RIR_OFFSET(reg) << 6;
788
789 mb = div_u64_rem(tmp_mb, 1000, &kb);
790 edac_dbg(0, "CH#%d RIR#%d INTL#%d, offset %u.%03u GB (0x%016Lx), tgt: %d, reg=0x%08x\n",
791 i, j, k,
792 mb, kb,
793 ((u64)tmp_mb) << 20L,
794 (u32)RIR_RNK_TGT(reg),
795 reg);
796 }
797 }
798 }
799 }
800
801 struct mem_ctl_info *get_mci_for_node_id(u8 node_id)
802 {
803 struct sbridge_dev *sbridge_dev;
804
805 list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) {
806 if (sbridge_dev->node_id == node_id)
807 return sbridge_dev->mci;
808 }
809 return NULL;
810 }
811
812 static int get_memory_error_data(struct mem_ctl_info *mci,
813 u64 addr,
814 u8 *socket,
815 long *channel_mask,
816 u8 *rank,
817 char **area_type, char *msg)
818 {
819 struct mem_ctl_info *new_mci;
820 struct sbridge_pvt *pvt = mci->pvt_info;
821 int n_rir, n_sads, n_tads, sad_way, sck_xch;
822 int sad_interl, idx, base_ch;
823 int interleave_mode;
824 unsigned sad_interleave[pvt->info.max_interleave];
825 u32 reg;
826 u8 ch_way, sck_way;
827 u32 tad_offset;
828 u32 rir_way;
829 u32 mb, kb;
830 u64 ch_addr, offset, limit, prv = 0;
831
832
833 /*
834 * Step 0) Check if the address is in a special memory range
835 * The check below is probably enough to cover all cases where
836 * the error is not inside memory, except for the legacy
837 * range (e.g. VGA addresses). It is unlikely, however, that the
838 * memory controller would generate an error on that range.
839 */
840 if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) {
841 sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr);
842 return -EINVAL;
843 }
844 if (addr >= (u64)pvt->tohm) {
845 sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr);
846 return -EINVAL;
847 }
848
849 /*
850 * Step 1) Get socket
851 */
852 for (n_sads = 0; n_sads < pvt->info.max_sad; n_sads++) {
853 pci_read_config_dword(pvt->pci_sad0, pvt->info.dram_rule[n_sads],
854 &reg);
855
856 if (!DRAM_RULE_ENABLE(reg))
857 continue;
858
859 limit = SAD_LIMIT(reg);
860 if (limit <= prv) {
861 sprintf(msg, "Can't discover the memory socket");
862 return -EINVAL;
863 }
864 if (addr <= limit)
865 break;
866 prv = limit;
867 }
868 if (n_sads == pvt->info.max_sad) {
869 sprintf(msg, "Can't discover the memory socket");
870 return -EINVAL;
871 }
872 *area_type = get_dram_attr(reg);
873 interleave_mode = INTERLEAVE_MODE(reg);
874
875 pci_read_config_dword(pvt->pci_sad0, pvt->info.interleave_list[n_sads],
876 &reg);
877 sad_interl = sad_pkg(reg, 0);
878 for (sad_way = 0; sad_way < 8; sad_way++) {
879 if (sad_way > 0 && sad_interl == sad_pkg(reg, sad_way))
880 break;
881 sad_interleave[sad_way] = sad_pkg(reg, sad_way);
882 edac_dbg(0, "SAD interleave #%d: %d\n",
883 sad_way, sad_interleave[sad_way]);
884 }
885 edac_dbg(0, "mc#%d: Error detected on SAD#%d: address 0x%016Lx < 0x%016Lx, Interleave [%d:6]%s\n",
886 pvt->sbridge_dev->mc,
887 n_sads,
888 addr,
889 limit,
890 sad_way + 7,
891 interleave_mode ? "" : "XOR[18:16]");
892 if (interleave_mode)
893 idx = ((addr >> 6) ^ (addr >> 16)) & 7;
894 else
895 idx = (addr >> 6) & 7;
896 switch (sad_way) {
897 case 1:
898 idx = 0;
899 break;
900 case 2:
901 idx = idx & 1;
902 break;
903 case 4:
904 idx = idx & 3;
905 break;
906 case 8:
907 break;
908 default:
909 sprintf(msg, "Can't discover socket interleave");
910 return -EINVAL;
911 }
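
/*
 * For example, with a 4-way SAD interleave (sad_way = 4) the 3-bit
 * index is masked down to two bits, so only entries 0-3 of the
 * interleave list are ever used to pick the target socket.
 */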
912 *socket = sad_interleave[idx];
913 edac_dbg(0, "SAD interleave index: %d (wayness %d) = CPU socket %d\n",
914 idx, sad_way, *socket);
915
916 /*
917 * Move to the proper node structure, in order to access the
918 * right PCI registers
919 */
920 new_mci = get_mci_for_node_id(*socket);
921 if (!new_mci) {
922 sprintf(msg, "Struct for socket #%u wasn't initialized",
923 *socket);
924 return -EINVAL;
925 }
926 mci = new_mci;
927 pvt = mci->pvt_info;
928
929 /*
930 * Step 2) Get memory channel
931 */
932 prv = 0;
933 for (n_tads = 0; n_tads < MAX_TAD; n_tads++) {
934 pci_read_config_dword(pvt->pci_ha0, tad_dram_rule[n_tads],
935 &reg);
936 limit = TAD_LIMIT(reg);
937 if (limit <= prv) {
938 sprintf(msg, "Can't discover the memory channel");
939 return -EINVAL;
940 }
941 if (addr <= limit)
942 break;
943 prv = limit;
944 }
945 ch_way = TAD_CH(reg) + 1;
946 sck_way = TAD_SOCK(reg) + 1;
947 /*
948 * FIXME: Is it right to always use channel 0 for offsets?
949 */
950 pci_read_config_dword(pvt->pci_tad[0],
951 tad_ch_nilv_offset[n_tads],
952 &tad_offset);
953
954 if (ch_way == 3)
955 idx = addr >> 6;
956 else
957 idx = addr >> (6 + sck_way);
958 idx = idx % ch_way;
959
960 /*
961 * FIXME: Shouldn't we use CHN_IDX_OFFSET() here, when ch_way == 3 ???
962 */
963 switch (idx) {
964 case 0:
965 base_ch = TAD_TGT0(reg);
966 break;
967 case 1:
968 base_ch = TAD_TGT1(reg);
969 break;
970 case 2:
971 base_ch = TAD_TGT2(reg);
972 break;
973 case 3:
974 base_ch = TAD_TGT3(reg);
975 break;
976 default:
977 sprintf(msg, "Can't discover the TAD target");
978 return -EINVAL;
979 }
980 *channel_mask = 1 << base_ch;
981
982 if (pvt->is_mirrored) {
983 *channel_mask |= 1 << ((base_ch + 2) % 4);
984 switch (ch_way) {
985 case 2:
986 case 4:
987 sck_xch = 1 << sck_way * (ch_way >> 1);
988 break;
989 default:
990 sprintf(msg, "Invalid mirror set. Can't decode addr");
991 return -EINVAL;
992 }
993 } else
994 sck_xch = (1 << sck_way) * ch_way;
995
996 if (pvt->is_lockstep)
997 *channel_mask |= 1 << ((base_ch + 1) % 4);
998
999 offset = TAD_OFFSET(tad_offset);
1000
1001 edac_dbg(0, "TAD#%d: address 0x%016Lx < 0x%016Lx, socket interleave %d, channel interleave %d (offset 0x%08Lx), index %d, base ch: %d, ch mask: 0x%02lx\n",
1002 n_tads,
1003 addr,
1004 limit,
1005 (u32)TAD_SOCK(reg),
1006 ch_way,
1007 offset,
1008 idx,
1009 base_ch,
1010 *channel_mask);
1011
1012 /* Calculate channel address */
1013 /* Remove the TAD offset */
1014
1015 if (offset > addr) {
1016 sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!",
1017 offset, addr);
1018 return -EINVAL;
1019 }
1020 addr -= offset;
1021 /* Store the low bits [0:6] of the addr */
1022 ch_addr = addr & 0x7f;
1023 /* Remove socket wayness and remove 6 bits */
1024 addr >>= 6;
1025 addr = div_u64(addr, sck_xch);
1026 #if 0
1027 /* Divide by channel way */
1028 addr = addr / ch_way;
1029 #endif
1030 /* Recover the last 6 bits */
1031 ch_addr |= addr << 6;
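
/*
 * For example, with sck_xch = 2 and the TAD offset already removed, an
 * address of 0x100040 keeps its low 7 bits (0x40) while the remaining
 * bits are halved, giving ch_addr = 0x80040, roughly half of the
 * original address, as expected for a 2-way interleave.
 */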
1032
1033 /*
1034 * Step 3) Decode rank
1035 */
1036 for (n_rir = 0; n_rir < MAX_RIR_RANGES; n_rir++) {
1037 pci_read_config_dword(pvt->pci_tad[base_ch],
1038 rir_way_limit[n_rir],
1039 &reg);
1040
1041 if (!IS_RIR_VALID(reg))
1042 continue;
1043
1044 limit = RIR_LIMIT(reg);
1045 mb = div_u64_rem(limit >> 20, 1000, &kb);
1046 edac_dbg(0, "RIR#%d, limit: %u.%03u GB (0x%016Lx), way: %d\n",
1047 n_rir,
1048 mb, kb,
1049 limit,
1050 1 << RIR_WAY(reg));
1051 if (ch_addr <= limit)
1052 break;
1053 }
1054 if (n_rir == MAX_RIR_RANGES) {
1055 sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx",
1056 ch_addr);
1057 return -EINVAL;
1058 }
1059 rir_way = RIR_WAY(reg);
1060 if (pvt->is_close_pg)
1061 idx = (ch_addr >> 6);
1062 else
1063 idx = (ch_addr >> 13); /* FIXME: Datasheet says to shift by 15 */
1064 idx %= 1 << rir_way;
1065
1066 pci_read_config_dword(pvt->pci_tad[base_ch],
1067 rir_offset[n_rir][idx],
1068 &reg);
1069 *rank = RIR_RNK_TGT(reg);
1070
1071 edac_dbg(0, "RIR#%d: channel address 0x%08Lx < 0x%08Lx, RIR interleave %d, index %d\n",
1072 n_rir,
1073 ch_addr,
1074 limit,
1075 rir_way,
1076 idx);
1077
1078 return 0;
1079 }
1080
1081 /****************************************************************************
1082 Device initialization routines: put/get, init/exit
1083 ****************************************************************************/
1084
1085 /*
1086 * sbridge_put_all_devices 'put' all the devices that we have
1087 * reserved via 'get'
1088 */
1089 static void sbridge_put_devices(struct sbridge_dev *sbridge_dev)
1090 {
1091 int i;
1092
1093 edac_dbg(0, "\n");
1094 for (i = 0; i < sbridge_dev->n_devs; i++) {
1095 struct pci_dev *pdev = sbridge_dev->pdev[i];
1096 if (!pdev)
1097 continue;
1098 edac_dbg(0, "Removing dev %02x:%02x.%d\n",
1099 pdev->bus->number,
1100 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
1101 pci_dev_put(pdev);
1102 }
1103 }
1104
1105 static void sbridge_put_all_devices(void)
1106 {
1107 struct sbridge_dev *sbridge_dev, *tmp;
1108
1109 list_for_each_entry_safe(sbridge_dev, tmp, &sbridge_edac_list, list) {
1110 sbridge_put_devices(sbridge_dev);
1111 free_sbridge_dev(sbridge_dev);
1112 }
1113 }
1114
1115 /*
1116 * sbridge_get_all_devices Find and perform 'get' operation on the MCH's
1117 * device/functions we want to reference for this driver
1118 *
1119 * Need to 'get' all the devices listed in pci_dev_descr_sbridge[]
1120 */
1121 static int sbridge_get_onedevice(struct pci_dev **prev,
1122 u8 *num_mc,
1123 const struct pci_id_table *table,
1124 const unsigned devno)
1125 {
1126 struct sbridge_dev *sbridge_dev;
1127 const struct pci_id_descr *dev_descr = &table->descr[devno];
1128
1129 struct pci_dev *pdev = NULL;
1130 u8 bus = 0;
1131
1132 sbridge_printk(KERN_INFO,
1133 "Seeking for: dev %02x.%d PCI ID %04x:%04x\n",
1134 dev_descr->dev, dev_descr->func,
1135 PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
1136
1137 pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
1138 dev_descr->dev_id, *prev);
1139
1140 if (!pdev) {
1141 if (*prev) {
1142 *prev = pdev;
1143 return 0;
1144 }
1145
1146 if (dev_descr->optional)
1147 return 0;
1148
1149 if (devno == 0)
1150 return -ENODEV;
1151
1152 sbridge_printk(KERN_INFO,
1153 "Device not found: dev %02x.%d PCI ID %04x:%04x\n",
1154 dev_descr->dev, dev_descr->func,
1155 PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
1156
1157 /* End of list, leave */
1158 return -ENODEV;
1159 }
1160 bus = pdev->bus->number;
1161
1162 sbridge_dev = get_sbridge_dev(bus);
1163 if (!sbridge_dev) {
1164 sbridge_dev = alloc_sbridge_dev(bus, table);
1165 if (!sbridge_dev) {
1166 pci_dev_put(pdev);
1167 return -ENOMEM;
1168 }
1169 (*num_mc)++;
1170 }
1171
1172 if (sbridge_dev->pdev[devno]) {
1173 sbridge_printk(KERN_ERR,
1174 "Duplicated device for "
1175 "dev %02x:%d.%d PCI ID %04x:%04x\n",
1176 bus, dev_descr->dev, dev_descr->func,
1177 PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
1178 pci_dev_put(pdev);
1179 return -ENODEV;
1180 }
1181
1182 sbridge_dev->pdev[devno] = pdev;
1183
1184 /* Sanity check */
1185 if (unlikely(PCI_SLOT(pdev->devfn) != dev_descr->dev ||
1186 PCI_FUNC(pdev->devfn) != dev_descr->func)) {
1187 sbridge_printk(KERN_ERR,
1188 "Device PCI ID %04x:%04x "
1189 "has dev %02x:%d.%d instead of dev %02x:%02x.%d\n",
1190 PCI_VENDOR_ID_INTEL, dev_descr->dev_id,
1191 bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
1192 bus, dev_descr->dev, dev_descr->func);
1193 return -ENODEV;
1194 }
1195
1196 /* Be sure that the device is enabled */
1197 if (unlikely(pci_enable_device(pdev) < 0)) {
1198 sbridge_printk(KERN_ERR,
1199 "Couldn't enable "
1200 "dev %02x:%d.%d PCI ID %04x:%04x\n",
1201 bus, dev_descr->dev, dev_descr->func,
1202 PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
1203 return -ENODEV;
1204 }
1205
1206 edac_dbg(0, "Detected dev %02x:%d.%d PCI ID %04x:%04x\n",
1207 bus, dev_descr->dev, dev_descr->func,
1208 PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
1209
1210 /*
1211 * As stated in drivers/pci/search.c, the reference count for
1212 * @from is always decremented if it is not %NULL. So, as we need
1213 * to get all devices up to null, we need to do a get for the device
1214 */
1215 pci_dev_get(pdev);
1216
1217 *prev = pdev;
1218
1219 return 0;
1220 }
1221
1222 static int sbridge_get_all_devices(u8 *num_mc)
1223 {
1224 int i, rc;
1225 struct pci_dev *pdev = NULL;
1226 const struct pci_id_table *table = pci_dev_descr_sbridge_table;
1227
1228 while (table && table->descr) {
1229 for (i = 0; i < table->n_devs; i++) {
1230 pdev = NULL;
1231 do {
1232 rc = sbridge_get_onedevice(&pdev, num_mc,
1233 table, i);
1234 if (rc < 0) {
1235 if (i == 0) {
1236 i = table->n_devs;
1237 break;
1238 }
1239 sbridge_put_all_devices();
1240 return -ENODEV;
1241 }
1242 } while (pdev);
1243 }
1244 table++;
1245 }
1246
1247 return 0;
1248 }
1249
1250 static int mci_bind_devs(struct mem_ctl_info *mci,
1251 struct sbridge_dev *sbridge_dev)
1252 {
1253 struct sbridge_pvt *pvt = mci->pvt_info;
1254 struct pci_dev *pdev;
1255 int i, func, slot;
1256
1257 for (i = 0; i < sbridge_dev->n_devs; i++) {
1258 pdev = sbridge_dev->pdev[i];
1259 if (!pdev)
1260 continue;
1261 slot = PCI_SLOT(pdev->devfn);
1262 func = PCI_FUNC(pdev->devfn);
1263 switch (slot) {
1264 case 12:
1265 switch (func) {
1266 case 6:
1267 pvt->pci_sad0 = pdev;
1268 break;
1269 case 7:
1270 pvt->pci_sad1 = pdev;
1271 break;
1272 default:
1273 goto error;
1274 }
1275 break;
1276 case 13:
1277 switch (func) {
1278 case 6:
1279 pvt->pci_br0 = pdev;
1280 break;
1281 default:
1282 goto error;
1283 }
1284 break;
1285 case 14:
1286 switch (func) {
1287 case 0:
1288 pvt->pci_ha0 = pdev;
1289 break;
1290 default:
1291 goto error;
1292 }
1293 break;
1294 case 15:
1295 switch (func) {
1296 case 0:
1297 pvt->pci_ta = pdev;
1298 break;
1299 case 1:
1300 pvt->pci_ras = pdev;
1301 break;
1302 case 2:
1303 case 3:
1304 case 4:
1305 case 5:
1306 pvt->pci_tad[func - 2] = pdev;
1307 break;
1308 default:
1309 goto error;
1310 }
1311 break;
1312 case 17:
1313 switch (func) {
1314 case 0:
1315 pvt->pci_ddrio = pdev;
1316 break;
1317 default:
1318 goto error;
1319 }
1320 break;
1321 default:
1322 goto error;
1323 }
1324
1325 edac_dbg(0, "Associated PCI %02x.%02d.%d with dev = %p\n",
1326 sbridge_dev->bus,
1327 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
1328 pdev);
1329 }
1330
1331 /* Check if everything was registered */
1332 if (!pvt->pci_sad0 || !pvt->pci_sad1 || !pvt->pci_ha0 ||
1333     !pvt->pci_tad || !pvt->pci_ras || !pvt->pci_ta)
1334 goto enodev;
1335
1336 for (i = 0; i < NUM_CHANNELS; i++) {
1337 if (!pvt->pci_tad[i])
1338 goto enodev;
1339 }
1340 return 0;
1341
1342 enodev:
1343 sbridge_printk(KERN_ERR, "Some needed devices are missing\n");
1344 return -ENODEV;
1345
1346 error:
1347 sbridge_printk(KERN_ERR, "Device %d, function %d "
1348 "is out of the expected range\n",
1349 slot, func);
1350 return -EINVAL;
1351 }
1352
1353 /****************************************************************************
1354 Error check routines
1355 ****************************************************************************/
1356
1357 /*
1358 * While Sandy Bridge has error count registers, the SMI BIOS reads values
1359 * from them and resets the counters, so they are not reliable for the OS
1360 * to read. We have no option but to just trust whatever the MCE is
1361 * telling us about the errors.
1362 */
1363 static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1364 const struct mce *m)
1365 {
1366 struct mem_ctl_info *new_mci;
1367 struct sbridge_pvt *pvt = mci->pvt_info;
1368 enum hw_event_mc_err_type tp_event;
1369 char *type, *optype, msg[256];
1370 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
1371 bool overflow = GET_BITFIELD(m->status, 62, 62);
1372 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
1373 bool recoverable = GET_BITFIELD(m->status, 56, 56);
1374 u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
1375 u32 mscod = GET_BITFIELD(m->status, 16, 31);
1376 u32 errcode = GET_BITFIELD(m->status, 0, 15);
1377 u32 channel = GET_BITFIELD(m->status, 0, 3);
1378 u32 optypenum = GET_BITFIELD(m->status, 4, 6);
1379 long channel_mask, first_channel;
1380 u8 rank, socket;
1381 int rc, dimm;
1382 char *area_type = NULL;
1383
1384 if (uncorrected_error) {
1385 if (ripv) {
1386 type = "FATAL";
1387 tp_event = HW_EVENT_ERR_FATAL;
1388 } else {
1389 type = "NON_FATAL";
1390 tp_event = HW_EVENT_ERR_UNCORRECTED;
1391 }
1392 } else {
1393 type = "CORRECTED";
1394 tp_event = HW_EVENT_ERR_CORRECTED;
1395 }
1396
1397 /*
1398 * According to Table 15-9 of the Intel Architecture spec vol 3A,
1399 * memory errors should fit in this mask:
1400 * 000f 0000 1mmm cccc (binary)
1401 * where:
1402 * f = Correction Report Filtering Bit. If 1, subsequent errors
1403 * won't be shown
1404 * mmm = error type
1405 * cccc = channel
1406 * If the mask doesn't match, report an error to the parsing logic
1407 */
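/*
 * For example, a status code of 0x0091 passes the check: it encodes a
 * memory read error (mmm = 001) on channel 1 (cccc = 0001). A code like
 * 0x0150 fails the mask test and is left to the generic parser.
 */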
1408 if (!((errcode & 0xef80) == 0x80)) {
1409 optype = "Can't parse: it is not a mem";
1410 } else {
1411 switch (optypenum) {
1412 case 0:
1413 optype = "generic undef request error";
1414 break;
1415 case 1:
1416 optype = "memory read error";
1417 break;
1418 case 2:
1419 optype = "memory write error";
1420 break;
1421 case 3:
1422 optype = "addr/cmd error";
1423 break;
1424 case 4:
1425 optype = "memory scrubbing error";
1426 break;
1427 default:
1428 optype = "reserved";
1429 break;
1430 }
1431 }
1432
1433 rc = get_memory_error_data(mci, m->addr, &socket,
1434 &channel_mask, &rank, &area_type, msg);
1435 if (rc < 0)
1436 goto err_parsing;
1437 new_mci = get_mci_for_node_id(socket);
1438 if (!new_mci) {
1439 strcpy(msg, "Error: socket got corrupted!");
1440 goto err_parsing;
1441 }
1442 mci = new_mci;
1443 pvt = mci->pvt_info;
1444
1445 first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
1446
1447 if (rank < 4)
1448 dimm = 0;
1449 else if (rank < 8)
1450 dimm = 1;
1451 else
1452 dimm = 2;
1453
1454
1455 /*
1456 * FIXME: On some memory configurations (mirror, lockstep), the
1457 * Memory Controller can't point the error to a single DIMM. The
1458 * EDAC core should be handling the channel mask, in order to point
1459 * to the group of DIMMs where the error may be happening.
1460 */
1461 snprintf(msg, sizeof(msg),
1462 "%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
1463 overflow ? " OVERFLOW" : "",
1464 (uncorrected_error && recoverable) ? " recoverable" : "",
1465 area_type,
1466 mscod, errcode,
1467 socket,
1468 channel_mask,
1469 rank);
1470
1471 edac_dbg(0, "%s\n", msg);
1472
1473 /* FIXME: need support for channel mask */
1474
1475 /* Call the helper to output message */
1476 edac_mc_handle_error(tp_event, mci, core_err_cnt,
1477 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
1478 channel, dimm, -1,
1479 optype, msg);
1480 return;
1481 err_parsing:
1482 edac_mc_handle_error(tp_event, mci, core_err_cnt, 0, 0, 0,
1483 -1, -1, -1,
1484 msg, "");
1485
1486 }
1487
1488 /*
1489 * sbridge_check_error Retrieve and process errors reported by the
1490 * hardware. Called by the Core module.
1491 */
1492 static void sbridge_check_error(struct mem_ctl_info *mci)
1493 {
1494 struct sbridge_pvt *pvt = mci->pvt_info;
1495 int i;
1496 unsigned count = 0;
1497 struct mce *m;
1498
1499 /*
1500 * MCE first step: Copy all mce errors into a temporary buffer
1501 * We use double buffering here, to reduce the risk of
1502 * losing an error.
1503 */
1504 smp_rmb();
1505 count = (pvt->mce_out + MCE_LOG_LEN - pvt->mce_in)
1506 % MCE_LOG_LEN;
1507 if (!count)
1508 return;
1509
1510 m = pvt->mce_outentry;
1511 if (pvt->mce_in + count > MCE_LOG_LEN) {
1512 unsigned l = MCE_LOG_LEN - pvt->mce_in;
1513
1514 memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * l);
1515 smp_wmb();
1516 pvt->mce_in = 0;
1517 count -= l;
1518 m += l;
1519 }
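/*
 * Wrap-around example, assuming MCE_LOG_LEN = 32: with mce_in = 30 and
 * count = 5, the memcpy above drains entries 30 and 31, and the memcpy
 * below picks up the remaining 3 entries from the start of the ring.
 */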
1520 memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * count);
1521 smp_wmb();
1522 pvt->mce_in += count;
1523
1524 smp_rmb();
1525 if (pvt->mce_overrun) {
1526 sbridge_printk(KERN_ERR, "Lost %d memory errors\n",
1527 pvt->mce_overrun);
1528 smp_wmb();
1529 pvt->mce_overrun = 0;
1530 }
1531
1532 /*
1533 * MCE second step: parse errors and display
1534 */
1535 for (i = 0; i < count; i++)
1536 sbridge_mce_output_error(mci, &pvt->mce_outentry[i]);
1537 }
1538
1539 /*
1540 * sbridge_mce_check_error Replicates mcelog routine to get errors
1541 * This routine simply queues mcelog errors, and
1542 * returns. The error itself should be handled later
1543 * by sbridge_check_error.
1544 * WARNING: As this routine should be called at NMI time, extra care should
1545 * be taken to avoid deadlocks, and to be as fast as possible.
1546 */
1547 static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
1548 void *data)
1549 {
1550 struct mce *mce = (struct mce *)data;
1551 struct mem_ctl_info *mci;
1552 struct sbridge_pvt *pvt;
1553
1554 mci = get_mci_for_node_id(mce->socketid);
1555 if (!mci)
1556 return NOTIFY_BAD;
1557 pvt = mci->pvt_info;
1558
1559 /*
1560 * Just let mcelog handle it if the error is
1561 * outside the memory controller. A memory error
1562 * is indicated by bit 7 = 1 and bits 8-11,13-15 = 0;
1563 * bit 12 has a special meaning.
1564 */
1565 if ((mce->status & 0xefff) >> 7 != 1)
1566 return NOTIFY_DONE;
1567
1568 printk("sbridge: HANDLING MCE MEMORY ERROR\n");
1569
1570 printk("CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
1571 mce->extcpu, mce->mcgstatus, mce->bank, mce->status);
1572 printk("TSC %llx ", mce->tsc);
1573 printk("ADDR %llx ", mce->addr);
1574 printk("MISC %llx ", mce->misc);
1575
1576 printk("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
1577 mce->cpuvendor, mce->cpuid, mce->time,
1578 mce->socketid, mce->apicid);
1579
1580 /* Only handle if it is the right memory controller */
1581 if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc)
1582 return NOTIFY_DONE;
1583
1584 smp_rmb();
1585 if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
1586 smp_wmb();
1587 pvt->mce_overrun++;
1588 return NOTIFY_DONE;
1589 }
1590
1591 /* Copy the memory error into the ring buffer */
1592 memcpy(&pvt->mce_entry[pvt->mce_out], mce, sizeof(*mce));
1593 smp_wmb();
1594 pvt->mce_out = (pvt->mce_out + 1) % MCE_LOG_LEN;
1595
1596 /* Handle fatal errors immediately */
1597 if (mce->mcgstatus & 1)
1598 sbridge_check_error(mci);
1599
1600 /* Advise mcelog that the error was handled */
1601 return NOTIFY_STOP;
1602 }
1603
1604 static struct notifier_block sbridge_mce_dec = {
1605 .notifier_call = sbridge_mce_check_error,
1606 };
1607
1608 /****************************************************************************
1609 EDAC register/unregister logic
1610 ****************************************************************************/
1611
1612 static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev)
1613 {
1614 struct mem_ctl_info *mci = sbridge_dev->mci;
1615 struct sbridge_pvt *pvt;
1616
1617 if (unlikely(!mci || !mci->pvt_info)) {
1618 edac_dbg(0, "MC: dev = %p\n", &sbridge_dev->pdev[0]->dev);
1619
1620 sbridge_printk(KERN_ERR, "Couldn't find mci handler\n");
1621 return;
1622 }
1623
1624 pvt = mci->pvt_info;
1625
1626 edac_dbg(0, "MC: mci = %p, dev = %p\n",
1627 mci, &sbridge_dev->pdev[0]->dev);
1628
1629 /* Remove MC sysfs nodes */
1630 edac_mc_del_mc(mci->pdev);
1631
1632 edac_dbg(1, "%s: free mci struct\n", mci->ctl_name);
1633 kfree(mci->ctl_name);
1634 edac_mc_free(mci);
1635 sbridge_dev->mci = NULL;
1636 }
1637
1638 static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
1639 {
1640 struct mem_ctl_info *mci;
1641 struct edac_mc_layer layers[2];
1642 struct sbridge_pvt *pvt;
1643 int rc;
1644
1645 /* Bail out early if ECC is not enabled */
1646 rc = check_if_ecc_is_active(sbridge_dev->bus);
1647 if (unlikely(rc < 0))
1648 return rc;
1649
1650 /* allocate a new MC control structure */
1651 layers[0].type = EDAC_MC_LAYER_CHANNEL;
1652 layers[0].size = NUM_CHANNELS;
1653 layers[0].is_virt_csrow = false;
1654 layers[1].type = EDAC_MC_LAYER_SLOT;
1655 layers[1].size = MAX_DIMMS;
1656 layers[1].is_virt_csrow = true;
1657 mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers,
1658 sizeof(*pvt));
1659
1660 if (unlikely(!mci))
1661 return -ENOMEM;
1662
1663 edac_dbg(0, "MC: mci = %p, dev = %p\n",
1664 mci, &sbridge_dev->pdev[0]->dev);
1665
1666 pvt = mci->pvt_info;
1667 memset(pvt, 0, sizeof(*pvt));
1668
1669 /* Associate sbridge_dev and mci for future usage */
1670 pvt->sbridge_dev = sbridge_dev;
1671 sbridge_dev->mci = mci;
1672
1673 mci->mtype_cap = MEM_FLAG_DDR3;
1674 mci->edac_ctl_cap = EDAC_FLAG_NONE;
1675 mci->edac_cap = EDAC_FLAG_NONE;
1676 mci->mod_name = "sbridge_edac.c";
1677 mci->mod_ver = SBRIDGE_REVISION;
1678 mci->ctl_name = kasprintf(GFP_KERNEL, "Sandy Bridge Socket#%d", mci->mc_idx);
1679 mci->dev_name = pci_name(sbridge_dev->pdev[0]);
1680 mci->ctl_page_to_phys = NULL;
1681 pvt->info.get_tolm = sbridge_get_tolm;
1682 pvt->info.get_tohm = sbridge_get_tohm;
1683 pvt->info.dram_rule = sbridge_dram_rule;
1684 pvt->info.max_sad = ARRAY_SIZE(sbridge_dram_rule);
1685 pvt->info.interleave_list = sbridge_interleave_list;
1686 pvt->info.max_interleave = ARRAY_SIZE(sbridge_interleave_list);
1687
1688 /* Set the function pointer to an actual operation function */
1689 mci->edac_check = sbridge_check_error;
1690
1691 /* Store pci devices at mci for faster access */
1692 rc = mci_bind_devs(mci, sbridge_dev);
1693 if (unlikely(rc < 0))
1694 goto fail0;
1695
1696 /* Get dimm basic config and the memory layout */
1697 get_dimm_config(mci);
1698 get_memory_layout(mci);
1699
1700 /* record ptr to the generic device */
1701 mci->pdev = &sbridge_dev->pdev[0]->dev;
1702
1703 /* add this new MC control structure to EDAC's list of MCs */
1704 if (unlikely(edac_mc_add_mc(mci))) {
1705 edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
1706 rc = -EINVAL;
1707 goto fail0;
1708 }
1709
1710 return 0;
1711
1712 fail0:
1713 kfree(mci->ctl_name);
1714 edac_mc_free(mci);
1715 sbridge_dev->mci = NULL;
1716 return rc;
1717 }
1718
1719 /*
1720 * sbridge_probe Probe for ONE instance of device to see if it is
1721 * present.
1722 * return:
1723 * 0 for FOUND a device
1724 * < 0 for error code
1725 */
1726
1727 static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id)
1728 {
1729 int rc;
1730 u8 mc, num_mc = 0;
1731 struct sbridge_dev *sbridge_dev;
1732
1733 /* get the pci devices we want to reserve for our use */
1734 mutex_lock(&sbridge_edac_lock);
1735
1736 /*
1737 * All memory controllers are allocated at the first pass.
1738 */
1739 if (unlikely(probed >= 1)) {
1740 mutex_unlock(&sbridge_edac_lock);
1741 return -ENODEV;
1742 }
1743 probed++;
1744
1745 rc = sbridge_get_all_devices(&num_mc);
1746 if (unlikely(rc < 0))
1747 goto fail0;
1748 mc = 0;
1749
1750 list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) {
1751 edac_dbg(0, "Registering MC#%d (%d of %d)\n",
1752 mc, mc + 1, num_mc);
1753 sbridge_dev->mc = mc++;
1754 rc = sbridge_register_mci(sbridge_dev);
1755 if (unlikely(rc < 0))
1756 goto fail1;
1757 }
1758
1759 sbridge_printk(KERN_INFO, "Driver loaded.\n");
1760
1761 mutex_unlock(&sbridge_edac_lock);
1762 return 0;
1763
1764 fail1:
1765 list_for_each_entry(sbridge_dev, &sbridge_edac_list, list)
1766 sbridge_unregister_mci(sbridge_dev);
1767
1768 sbridge_put_all_devices();
1769 fail0:
1770 mutex_unlock(&sbridge_edac_lock);
1771 return rc;
1772 }
1773
1774 /*
1775 * sbridge_remove destructor for one instance of device
1776 *
1777 */
1778 static void sbridge_remove(struct pci_dev *pdev)
1779 {
1780 struct sbridge_dev *sbridge_dev;
1781
1782 edac_dbg(0, "\n");
1783
1784 /*
1785 * we have a problem here: the pdev value for removal will be wrong, since
1786 * it will point to the X58 register used to detect that the machine
1787 * is a Nehalem or later design. However, due to the way several PCI
1788 * devices are grouped together to provide MC functionality, we need
1789 * to use a different method for releasing the devices
1790 */
1791
1792 mutex_lock(&sbridge_edac_lock);
1793
1794 if (unlikely(!probed)) {
1795 mutex_unlock(&sbridge_edac_lock);
1796 return;
1797 }
1798
1799 list_for_each_entry(sbridge_dev, &sbridge_edac_list, list)
1800 sbridge_unregister_mci(sbridge_dev);
1801
1802 /* Release PCI resources */
1803 sbridge_put_all_devices();
1804
1805 probed--;
1806
1807 mutex_unlock(&sbridge_edac_lock);
1808 }
1809
1810 MODULE_DEVICE_TABLE(pci, sbridge_pci_tbl);
1811
1812 /*
1813 * sbridge_driver pci_driver structure for this module
1814 *
1815 */
1816 static struct pci_driver sbridge_driver = {
1817 .name = "sbridge_edac",
1818 .probe = sbridge_probe,
1819 .remove = sbridge_remove,
1820 .id_table = sbridge_pci_tbl,
1821 };
1822
1823 /*
1824 * sbridge_init Module entry function
1825 * Try to initialize this module for its devices
1826 */
1827 static int __init sbridge_init(void)
1828 {
1829 int pci_rc;
1830
1831 edac_dbg(2, "\n");
1832
1833 /* Ensure that the OPSTATE is set correctly for POLL or NMI */
1834 opstate_init();
1835
1836 pci_rc = pci_register_driver(&sbridge_driver);
1837
1838 if (pci_rc >= 0) {
1839 mce_register_decode_chain(&sbridge_mce_dec);
1840 return 0;
1841 }
1842
1843 sbridge_printk(KERN_ERR, "Failed to register device with error %d.\n",
1844 pci_rc);
1845
1846 return pci_rc;
1847 }
1848
1849 /*
1850 * sbridge_exit() Module exit function
1851 * Unregister the driver
1852 */
1853 static void __exit sbridge_exit(void)
1854 {
1855 edac_dbg(2, "\n");
1856 pci_unregister_driver(&sbridge_driver);
1857 mce_unregister_decode_chain(&sbridge_mce_dec);
1858 }
1859
1860 module_init(sbridge_init);
1861 module_exit(sbridge_exit);
1862
1863 module_param(edac_op_state, int, 0444);
1864 MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
1865
1866 MODULE_LICENSE("GPL");
1867 MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@redhat.com>");
1868 MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)");
1869 MODULE_DESCRIPTION("MC Driver for Intel Sandy Bridge memory controllers - "
1870 SBRIDGE_REVISION);