Commit | Line | Data |
---|---|---|
0ef95b41 MS |
1 | /* |
2 | * OPAL hypervisor Maintenance interrupt handling support in PowerNV. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published by | |
6 | * the Free Software Foundation; either version 2 of the License, or | |
7 | * (at your option) any later version. | |
8 | * | |
9 | * This program is distributed in the hope that it will be useful, | |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | * GNU General Public License for more details. | |
13 | * | |
14 | * You should have received a copy of the GNU General Public License | |
15 | * along with this program; If not, see <http://www.gnu.org/licenses/>. | |
16 | * | |
17 | * Copyright 2014 IBM Corporation | |
18 | * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> | |
19 | */ | |
20 | ||
21 | #undef DEBUG | |
22 | ||
23 | #include <linux/kernel.h> | |
24 | #include <linux/init.h> | |
25 | #include <linux/of.h> | |
26 | #include <linux/mm.h> | |
27 | #include <linux/slab.h> | |
28 | ||
29 | #include <asm/opal.h> | |
30 | #include <asm/cputable.h> | |
c1c8a92f | 31 | #include <asm/machdep.h> |
0ef95b41 MS |
32 | |
33 | static int opal_hmi_handler_nb_init; | |
34 | struct OpalHmiEvtNode { | |
35 | struct list_head list; | |
36 | struct OpalHMIEvent hmi_evt; | |
37 | }; | |
c33e11d0 MS |
38 | |
/*
 * Table entry used to decode a checkstop reason word into readable text.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* bit mask tested against the reason word */
	const char *unit_failed;	/* name printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit name */
};
44 | ||
0ef95b41 MS |
/* HMI events queued by the notifier, waiting to be logged by the worker. */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken (with interrupts disabled) around every access to the list above. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47 | ||
c33e11d0 MS |
48 | static void print_core_checkstop_reason(const char *level, |
49 | struct OpalHMIEvent *hmi_evt) | |
50 | { | |
51 | int i; | |
52 | static const struct xstop_reason xstop_reason[] = { | |
53 | { CORE_CHECKSTOP_IFU_REGFILE, "IFU", | |
54 | "RegFile core check stop" }, | |
55 | { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" }, | |
56 | { CORE_CHECKSTOP_PC_DURING_RECOV, "PC", | |
57 | "Core checkstop during recovery" }, | |
58 | { CORE_CHECKSTOP_ISU_REGFILE, "ISU", | |
59 | "RegFile core check stop (mapper error)" }, | |
60 | { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" }, | |
61 | { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" }, | |
62 | { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" }, | |
63 | { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC", | |
64 | "Recovery in maintenance mode" }, | |
65 | { CORE_CHECKSTOP_LSU_REGFILE, "LSU", | |
66 | "RegFile core check stop" }, | |
67 | { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC", | |
68 | "Forward Progress Error" }, | |
69 | { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" }, | |
70 | { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" }, | |
71 | { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC", | |
72 | "Hypervisor Resource error - core check stop" }, | |
73 | { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC", | |
74 | "Hang Recovery Failed (core check stop)" }, | |
75 | { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC", | |
76 | "Ambiguous Hang Detected (unknown source)" }, | |
77 | { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC", | |
78 | "Debug Trigger Error inject" }, | |
79 | { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC", | |
80 | "Hypervisor check stop via SPRC/SPRD" }, | |
81 | }; | |
82 | ||
83 | /* Validity check */ | |
84 | if (!hmi_evt->u.xstop_error.xstop_reason) { | |
85 | printk("%s Unknown Core check stop.\n", level); | |
86 | return; | |
87 | } | |
88 | ||
89 | printk("%s CPU PIR: %08x\n", level, | |
90 | be32_to_cpu(hmi_evt->u.xstop_error.u.pir)); | |
91 | for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) | |
92 | if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & | |
93 | xstop_reason[i].xstop_reason) | |
94 | printk("%s [Unit: %-3s] %s\n", level, | |
95 | xstop_reason[i].unit_failed, | |
96 | xstop_reason[i].description); | |
97 | } | |
98 | ||
99 | static void print_nx_checkstop_reason(const char *level, | |
100 | struct OpalHMIEvent *hmi_evt) | |
101 | { | |
102 | int i; | |
103 | static const struct xstop_reason xstop_reason[] = { | |
104 | { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine", | |
105 | "SHM invalid state error" }, | |
106 | { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine", | |
107 | "DMA invalid state error bit 15" }, | |
108 | { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine", | |
109 | "DMA invalid state error bit 16" }, | |
110 | { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine", | |
111 | "Channel 0 invalid state error" }, | |
112 | { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine", | |
113 | "Channel 1 invalid state error" }, | |
114 | { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine", | |
115 | "Channel 2 invalid state error" }, | |
116 | { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine", | |
117 | "Channel 3 invalid state error" }, | |
118 | { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine", | |
119 | "Channel 4 invalid state error" }, | |
120 | { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine", | |
121 | "Channel 5 invalid state error" }, | |
122 | { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine", | |
123 | "Channel 6 invalid state error" }, | |
124 | { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine", | |
125 | "Channel 7 invalid state error" }, | |
126 | { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine", | |
127 | "UE error on CRB(CSB address, CCB)" }, | |
128 | { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine", | |
129 | "SUE error on CRB(CSB address, CCB)" }, | |
130 | { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface", | |
131 | "CRB Kill ISN received while holding ISN with UE error" }, | |
132 | }; | |
133 | ||
134 | /* Validity check */ | |
135 | if (!hmi_evt->u.xstop_error.xstop_reason) { | |
136 | printk("%s Unknown NX check stop.\n", level); | |
137 | return; | |
138 | } | |
139 | ||
140 | printk("%s NX checkstop on CHIP ID: %x\n", level, | |
141 | be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); | |
142 | for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) | |
143 | if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & | |
144 | xstop_reason[i].xstop_reason) | |
145 | printk("%s [Unit: %-3s] %s\n", level, | |
146 | xstop_reason[i].unit_failed, | |
147 | xstop_reason[i].description); | |
148 | } | |
149 | ||
150 | static void print_checkstop_reason(const char *level, | |
151 | struct OpalHMIEvent *hmi_evt) | |
152 | { | |
153 | switch (hmi_evt->u.xstop_error.xstop_type) { | |
154 | case CHECKSTOP_TYPE_CORE: | |
155 | print_core_checkstop_reason(level, hmi_evt); | |
156 | break; | |
157 | case CHECKSTOP_TYPE_NX: | |
158 | print_nx_checkstop_reason(level, hmi_evt); | |
159 | break; | |
160 | case CHECKSTOP_TYPE_UNKNOWN: | |
161 | printk("%s Unknown Malfunction Alert.\n", level); | |
162 | break; | |
163 | } | |
164 | } | |
165 | ||
0ef95b41 MS |
166 | static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) |
167 | { | |
168 | const char *level, *sevstr, *error_info; | |
169 | static const char *hmi_error_types[] = { | |
170 | "Malfunction Alert", | |
171 | "Processor Recovery done", | |
172 | "Processor recovery occurred again", | |
173 | "Processor recovery occurred for masked error", | |
174 | "Timer facility experienced an error", | |
175 | "TFMR SPR is corrupted", | |
176 | "UPS (Uniterrupted Power System) Overflow indication", | |
177 | "An XSCOM operation failure", | |
178 | "An XSCOM operation completed", | |
179 | "SCOM has set a reserved FIR bit to cause recovery", | |
180 | "Debug trigger has set a reserved FIR bit to cause recovery", | |
181 | "A hypervisor resource error occurred" | |
182 | }; | |
183 | ||
184 | /* Print things out */ | |
6acbc5a1 | 185 | if (hmi_evt->version < OpalHMIEvt_V1) { |
0ef95b41 MS |
186 | pr_err("HMI Interrupt, Unknown event version %d !\n", |
187 | hmi_evt->version); | |
188 | return; | |
189 | } | |
190 | switch (hmi_evt->severity) { | |
191 | case OpalHMI_SEV_NO_ERROR: | |
192 | level = KERN_INFO; | |
193 | sevstr = "Harmless"; | |
194 | break; | |
195 | case OpalHMI_SEV_WARNING: | |
196 | level = KERN_WARNING; | |
197 | sevstr = ""; | |
198 | break; | |
199 | case OpalHMI_SEV_ERROR_SYNC: | |
200 | level = KERN_ERR; | |
201 | sevstr = "Severe"; | |
202 | break; | |
203 | case OpalHMI_SEV_FATAL: | |
204 | default: | |
205 | level = KERN_ERR; | |
206 | sevstr = "Fatal"; | |
207 | break; | |
208 | } | |
209 | ||
210 | printk("%s%s Hypervisor Maintenance interrupt [%s]\n", | |
211 | level, sevstr, | |
212 | hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? | |
213 | "Recovered" : "Not recovered"); | |
214 | error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? | |
215 | hmi_error_types[hmi_evt->type] | |
216 | : "Unknown"; | |
217 | printk("%s Error detail: %s\n", level, error_info); | |
218 | printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); | |
219 | if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || | |
220 | (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) | |
221 | printk("%s TFMR: %016llx\n", level, | |
222 | be64_to_cpu(hmi_evt->tfmr)); | |
c33e11d0 MS |
223 | |
224 | if (hmi_evt->version < OpalHMIEvt_V2) | |
225 | return; | |
226 | ||
227 | /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */ | |
228 | if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT) | |
229 | print_checkstop_reason(level, hmi_evt); | |
0ef95b41 MS |
230 | } |
231 | ||
232 | static void hmi_event_handler(struct work_struct *work) | |
233 | { | |
234 | unsigned long flags; | |
235 | struct OpalHMIEvent *hmi_evt; | |
236 | struct OpalHmiEvtNode *msg_node; | |
237 | uint8_t disposition; | |
1852ae27 MS |
238 | struct opal_msg msg; |
239 | int unrecoverable = 0; | |
0ef95b41 MS |
240 | |
241 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
242 | while (!list_empty(&opal_hmi_evt_list)) { | |
243 | msg_node = list_entry(opal_hmi_evt_list.next, | |
244 | struct OpalHmiEvtNode, list); | |
245 | list_del(&msg_node->list); | |
246 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
247 | ||
248 | hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; | |
249 | print_hmi_event_info(hmi_evt); | |
250 | disposition = hmi_evt->disposition; | |
251 | kfree(msg_node); | |
252 | ||
253 | /* | |
254 | * Check if HMI event has been recovered or not. If not | |
1852ae27 MS |
255 | * then kernel can't continue, we need to panic. |
256 | * But before we do that, display all the HMI event | |
257 | * available on the list and set unrecoverable flag to 1. | |
0ef95b41 MS |
258 | */ |
259 | if (disposition != OpalHMI_DISPOSITION_RECOVERED) | |
1852ae27 | 260 | unrecoverable = 1; |
0ef95b41 MS |
261 | |
262 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
263 | } | |
264 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
1852ae27 MS |
265 | |
266 | if (unrecoverable) { | |
62521ea6 MS |
267 | int ret; |
268 | ||
1852ae27 MS |
269 | /* Pull all HMI events from OPAL before we panic. */ |
270 | while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { | |
271 | u32 type; | |
272 | ||
273 | type = be32_to_cpu(msg.msg_type); | |
274 | ||
275 | /* skip if not HMI event */ | |
276 | if (type != OPAL_MSG_HMI_EVT) | |
277 | continue; | |
278 | ||
279 | /* HMI event info starts from param[0] */ | |
280 | hmi_evt = (struct OpalHMIEvent *)&msg.params[0]; | |
281 | print_hmi_event_info(hmi_evt); | |
282 | } | |
62521ea6 MS |
283 | |
284 | /* | |
285 | * Unrecoverable HMI exception. We need to inform BMC/OCC | |
286 | * about this error so that it can collect relevant data | |
287 | * for error analysis before rebooting. | |
288 | */ | |
289 | ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, | |
290 | "Unrecoverable HMI exception"); | |
291 | if (ret == OPAL_UNSUPPORTED) { | |
292 | pr_emerg("Reboot type %d not supported\n", | |
293 | OPAL_REBOOT_PLATFORM_ERROR); | |
294 | } | |
295 | ||
296 | /* | |
297 | * Fall through and panic if opal_cec_reboot2() returns | |
298 | * OPAL_UNSUPPORTED. | |
299 | */ | |
1852ae27 MS |
300 | panic("Unrecoverable HMI exception"); |
301 | } | |
0ef95b41 MS |
302 | } |
303 | ||
/* Work item that runs hmi_event_handler() to log queued HMI events. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
305 | /* | |
306 | * opal_handle_hmi_event - notifier handler that queues up HMI events | |
307 | * to be preocessed later. | |
308 | */ | |
309 | static int opal_handle_hmi_event(struct notifier_block *nb, | |
310 | unsigned long msg_type, void *msg) | |
311 | { | |
312 | unsigned long flags; | |
313 | struct OpalHMIEvent *hmi_evt; | |
314 | struct opal_msg *hmi_msg = msg; | |
315 | struct OpalHmiEvtNode *msg_node; | |
316 | ||
317 | /* Sanity Checks */ | |
318 | if (msg_type != OPAL_MSG_HMI_EVT) | |
319 | return 0; | |
320 | ||
321 | /* HMI event info starts from param[0] */ | |
322 | hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; | |
323 | ||
324 | /* Delay the logging of HMI events to workqueue. */ | |
325 | msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); | |
326 | if (!msg_node) { | |
327 | pr_err("HMI: out of memory, Opal message event not handled\n"); | |
328 | return -ENOMEM; | |
329 | } | |
330 | memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent)); | |
331 | ||
332 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
333 | list_add(&msg_node->list, &opal_hmi_evt_list); | |
334 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
335 | ||
336 | schedule_work(&hmi_event_work); | |
337 | return 0; | |
338 | } | |
339 | ||
340 | static struct notifier_block opal_hmi_handler_nb = { | |
341 | .notifier_call = opal_handle_hmi_event, | |
342 | .next = NULL, | |
343 | .priority = 0, | |
344 | }; | |
345 | ||
96e023e7 | 346 | int __init opal_hmi_handler_init(void) |
0ef95b41 MS |
347 | { |
348 | int ret; | |
349 | ||
350 | if (!opal_hmi_handler_nb_init) { | |
351 | ret = opal_message_notifier_register( | |
352 | OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); | |
353 | if (ret) { | |
354 | pr_err("%s: Can't register OPAL event notifier (%d)\n", | |
355 | __func__, ret); | |
356 | return ret; | |
357 | } | |
358 | opal_hmi_handler_nb_init = 1; | |
359 | } | |
360 | return 0; | |
361 | } |