/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/slab.h>
#include <linux/mutex.h>
#include "kfd_device_queue_manager.h"
#include "kfd_kernel_queue.h"
#include "kfd_priv.h"
#include "kfd_pm4_headers.h"
#include "kfd_pm4_headers_vi.h"
#include "kfd_pm4_opcodes.h"
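
/*
 * Advance the runlist IB write pointer (kept in dwords) by increment_bytes,
 * asserting that the new position still fits inside the IB buffer.
 */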
static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
				unsigned int buffer_size_bytes)
{
	unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t);

	BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes);
	*wptr = temp;
}
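
/*
 * Build a PM4 type-3 packet header for the given opcode. Per the PM4
 * convention, count is the packet size in dwords minus two.
 */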
static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
{
	union PM4_MES_TYPE_3_HEADER header;

	header.u32all = 0;
	header.opcode = opcode;
	header.count = packet_size / sizeof(uint32_t) - 2;
	header.type = PM4_TYPE_3;

	return header.u32all;
}
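
/*
 * Size the runlist IB: one MAP_PROCESS packet per process plus one
 * MAP_QUEUES packet per queue, reporting over-subscription when there is
 * more than one process or more queues than HW scheduler slots.
 */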
static void pm_calc_rlib_size(struct packet_manager *pm,
				unsigned int *rlib_size,
				bool *over_subscription)
{
	unsigned int process_count, queue_count;
	unsigned int map_queue_size;

	BUG_ON(!pm || !rlib_size || !over_subscription);

	process_count = pm->dqm->processes_count;
	queue_count = pm->dqm->queue_count;

	/* check if there is over subscription */
	*over_subscription = false;
	if ((process_count > 1) ||
		queue_count > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE) {
		*over_subscription = true;
		pr_debug("kfd: over subscribed runlist\n");
	}

	/* VI (Carrizo) uses its own MAP_QUEUES packet format */
	map_queue_size =
		(pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ?
		sizeof(struct pm4_mes_map_queues) :
		sizeof(struct pm4_map_queues);
	/* calculate run list ib allocation size */
	*rlib_size = process_count * sizeof(struct pm4_map_process) +
		     queue_count * map_queue_size;

	/*
	 * Increase the allocation size in case we need a chained run list
	 * when over subscription
	 */
	if (*over_subscription)
		*rlib_size += sizeof(struct pm4_runlist);

	pr_debug("kfd: runlist ib size %d\n", *rlib_size);
}
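
/*
 * Allocate the runlist IB from the device GTT sub-allocator and return its
 * CPU and GPU addresses; the IB stays owned by pm until pm_release_ib().
 */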
static int pm_allocate_runlist_ib(struct packet_manager *pm,
				unsigned int **rl_buffer,
				uint64_t *rl_gpu_buffer,
				unsigned int *rl_buffer_size,
				bool *is_over_subscription)
{
	int retval;

	BUG_ON(!pm);
	BUG_ON(pm->allocated == true);
	BUG_ON(is_over_subscription == NULL);

	pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);

	retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
					&pm->ib_buffer_obj);

	if (retval != 0) {
		pr_err("kfd: failed to allocate runlist IB\n");
		return retval;
	}

	*(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
	*rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr;

	memset(*rl_buffer, 0, *rl_buffer_size);
	pm->allocated = true;
	return retval;
}
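
/*
 * Write a RUN_LIST packet pointing the CP at a runlist IB; the chain flag
 * makes the packet chain to a further runlist instead of ending the list.
 */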
static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
			uint64_t ib, size_t ib_size_in_dwords, bool chain)
{
	struct pm4_runlist *packet;

	BUG_ON(!pm || !buffer || !ib);

	packet = (struct pm4_runlist *)buffer;

	memset(buffer, 0, sizeof(struct pm4_runlist));
	packet->header.u32all = build_pm4_header(IT_RUN_LIST,
						sizeof(struct pm4_runlist));

	packet->bitfields4.ib_size = ib_size_in_dwords;
	packet->bitfields4.chain = chain ? 1 : 0;
	packet->bitfields4.offload_polling = 0;
	packet->bitfields4.valid = 1;
	packet->ordinal2 = lower_32_bits(ib);
	packet->bitfields3.ib_base_hi = upper_32_bits(ib);

	return 0;
}
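
/*
 * Write a MAP_PROCESS packet describing one process to the HW scheduler:
 * PASID, page table base, shared-memory apertures, GDS allocation and the
 * number of queues belonging to the process.
 */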
static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
				struct qcm_process_device *qpd)
{
	struct pm4_map_process *packet;
	struct queue *cur;
	uint32_t num_queues;

	BUG_ON(!pm || !buffer || !qpd);

	packet = (struct pm4_map_process *)buffer;

	pr_debug("kfd: In func %s\n", __func__);

	memset(buffer, 0, sizeof(struct pm4_map_process));

	packet->header.u32all = build_pm4_header(IT_MAP_PROCESS,
					sizeof(struct pm4_map_process));
	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
	packet->bitfields2.process_quantum = 1;
	packet->bitfields2.pasid = qpd->pqm->process->pasid;
	packet->bitfields3.page_table_base = qpd->page_table_base;
	packet->bitfields10.gds_size = qpd->gds_size;
	packet->bitfields10.num_gws = qpd->num_gws;
	packet->bitfields10.num_oac = qpd->num_oac;

	/* count this process's queues; a debug process reports zero */
	num_queues = 0;
	list_for_each_entry(cur, &qpd->queues_list, list)
		num_queues++;
	packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues;

	packet->sh_mem_config = qpd->sh_mem_config;
	packet->sh_mem_bases = qpd->sh_mem_bases;
	packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
	packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;

	packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
	packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);

	return 0;
}
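
/*
 * Write a VI-format MES MAP_QUEUES packet for one queue; the queue type
 * selects the engine and latency class, and SDMA queues are never static.
 */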
static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer,
		struct queue *q, bool is_static)
{
	struct pm4_mes_map_queues *packet;
	bool use_static = is_static;

	BUG_ON(!pm || !buffer || !q);

	pr_debug("kfd: In func %s\n", __func__);

	packet = (struct pm4_mes_map_queues *)buffer;
	memset(buffer, 0, sizeof(struct pm4_map_queues));

	packet->header.u32all = build_pm4_header(IT_MAP_QUEUES,
						sizeof(struct pm4_map_queues));
	packet->bitfields2.alloc_format =
		alloc_format__mes_map_queues__one_per_pipe_vi;
	packet->bitfields2.num_queues = 1;
	packet->bitfields2.queue_sel =
		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;

	packet->bitfields2.engine_sel =
		engine_sel__mes_map_queues__compute_vi;
	packet->bitfields2.queue_type =
		queue_type__mes_map_queues__normal_compute_vi;

	switch (q->properties.type) {
	case KFD_QUEUE_TYPE_COMPUTE:
		if (use_static)
			packet->bitfields2.queue_type =
		queue_type__mes_map_queues__normal_latency_static_queue_vi;
		break;
	case KFD_QUEUE_TYPE_DIQ:
		packet->bitfields2.queue_type =
			queue_type__mes_map_queues__debug_interface_queue_vi;
		break;
	case KFD_QUEUE_TYPE_SDMA:
		packet->bitfields2.engine_sel =
				engine_sel__mes_map_queues__sdma0_vi;
		use_static = false; /* no static queues under SDMA */
		break;
	default:
		pr_err("kfd: in %s queue type %d\n", __func__,
				q->properties.type);
		BUG();
		break;
	}

	packet->bitfields3.doorbell_offset =
			q->properties.doorbell_off;

	packet->mqd_addr_lo =
			lower_32_bits(q->gart_mqd_addr);

	packet->mqd_addr_hi =
			upper_32_bits(q->gart_mqd_addr);

	packet->wptr_addr_lo =
			lower_32_bits((uint64_t)q->properties.write_ptr);

	packet->wptr_addr_hi =
			upper_32_bits((uint64_t)q->properties.write_ptr);

	return 0;
}
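
/*
 * Write a pre-VI MAP_QUEUES packet for one queue, filling the first ordinal
 * with the doorbell offset, MQD address and write-pointer address.
 */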
static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
				struct queue *q, bool is_static)
{
	struct pm4_map_queues *packet;
	bool use_static = is_static;

	BUG_ON(!pm || !buffer || !q);

	pr_debug("kfd: In func %s\n", __func__);

	packet = (struct pm4_map_queues *)buffer;
	memset(buffer, 0, sizeof(struct pm4_map_queues));

	packet->header.u32all = build_pm4_header(IT_MAP_QUEUES,
						sizeof(struct pm4_map_queues));
	packet->bitfields2.alloc_format =
				alloc_format__mes_map_queues__one_per_pipe;
	packet->bitfields2.num_queues = 1;
	packet->bitfields2.queue_sel =
		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots;

	packet->bitfields2.vidmem = (q->properties.is_interop) ?
			vidmem__mes_map_queues__uses_video_memory :
			vidmem__mes_map_queues__uses_no_video_memory;

	switch (q->properties.type) {
	case KFD_QUEUE_TYPE_COMPUTE:
	case KFD_QUEUE_TYPE_DIQ:
		packet->bitfields2.engine_sel =
				engine_sel__mes_map_queues__compute;
		break;
	case KFD_QUEUE_TYPE_SDMA:
		packet->bitfields2.engine_sel =
				engine_sel__mes_map_queues__sdma0;
		use_static = false; /* no static queues under SDMA */
		break;
	default:
		BUG();
		break;
	}

	packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset =
			q->properties.doorbell_off;

	packet->mes_map_queues_ordinals[0].bitfields3.is_static =
			(use_static == true) ? 1 : 0;

	packet->mes_map_queues_ordinals[0].mqd_addr_lo =
			lower_32_bits(q->gart_mqd_addr);

	packet->mes_map_queues_ordinals[0].mqd_addr_hi =
			upper_32_bits(q->gart_mqd_addr);

	packet->mes_map_queues_ordinals[0].wptr_addr_lo =
			lower_32_bits((uint64_t)q->properties.write_ptr);

	packet->mes_map_queues_ordinals[0].wptr_addr_hi =
			upper_32_bits((uint64_t)q->properties.write_ptr);

	return 0;
}
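
/*
 * Build the complete runlist IB: a MAP_PROCESS packet per process, followed
 * by MAP_QUEUES packets for its active kernel and user queues. When the
 * runlist is over-subscribed, a final RUN_LIST packet chains back to the
 * start of this IB.
 */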
static int pm_create_runlist_ib(struct packet_manager *pm,
				struct list_head *queues,
				uint64_t *rl_gpu_addr,
				size_t *rl_size_bytes)
{
	unsigned int alloc_size_bytes;
	unsigned int *rl_buffer, rl_wptr, i;
	int retval, proccesses_mapped;
	struct device_process_node *cur;
	struct qcm_process_device *qpd;
	struct queue *q;
	struct kernel_queue *kq;
	bool is_over_subscription;

	BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr);

	rl_wptr = retval = proccesses_mapped = 0;

	retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
				&alloc_size_bytes, &is_over_subscription);
	if (retval != 0)
		return retval;

	*rl_size_bytes = alloc_size_bytes;

	pr_debug("kfd: In func %s\n", __func__);
	pr_debug("kfd: building runlist ib process count: %d queues count %d\n",
		pm->dqm->processes_count, pm->dqm->queue_count);

	/* build the run list ib packet */
	list_for_each_entry(cur, queues, list) {
		qpd = cur->qpd;

		/* build map process packet */
		if (proccesses_mapped >= pm->dqm->processes_count) {
			pr_debug("kfd: not enough space left in runlist IB\n");
			pm_release_ib(pm);
			return -ENOMEM;
		}

		retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
		if (retval != 0)
			return retval;

		proccesses_mapped++;
		inc_wptr(&rl_wptr, sizeof(struct pm4_map_process),
				alloc_size_bytes);

		list_for_each_entry(kq, &qpd->priv_queue_list, list) {
			if (kq->queue->properties.is_active != true)
				continue;

			pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n",
				kq->queue->queue, qpd->is_debug);

			if (pm->dqm->dev->device_info->asic_family ==
					CHIP_CARRIZO)
				retval = pm_create_map_queue_vi(pm,
						&rl_buffer[rl_wptr],
						kq->queue,
						qpd->is_debug);
			else
				retval = pm_create_map_queue(pm,
						&rl_buffer[rl_wptr],
						kq->queue,
						qpd->is_debug);
			if (retval != 0)
				return retval;

			inc_wptr(&rl_wptr,
				sizeof(struct pm4_map_queues),
				alloc_size_bytes);
		}

		list_for_each_entry(q, &qpd->queues_list, list) {
			if (q->properties.is_active != true)
				continue;

			pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n",
				q->queue, qpd->is_debug);

			if (pm->dqm->dev->device_info->asic_family ==
					CHIP_CARRIZO)
				retval = pm_create_map_queue_vi(pm,
						&rl_buffer[rl_wptr],
						q,
						qpd->is_debug);
			else
				retval = pm_create_map_queue(pm,
						&rl_buffer[rl_wptr],
						q,
						qpd->is_debug);
			if (retval != 0)
				return retval;

			inc_wptr(&rl_wptr,
				sizeof(struct pm4_map_queues),
				alloc_size_bytes);
		}
	}

	pr_debug("kfd: finished map process and queues to runlist\n");

	/* chain back to the start of the IB when over-subscribed */
	if (is_over_subscription)
		pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr,
				alloc_size_bytes / sizeof(uint32_t), true);

	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
		pr_debug("0x%2X ", rl_buffer[i]);
	pr_debug("\n");

	return retval;
}
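
/*
 * Initialize the packet manager: bind it to its DQM and create the HIQ
 * kernel queue used to submit scheduler packets to the CP.
 */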
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
{
	BUG_ON(!dqm);

	pm->dqm = dqm;
	mutex_init(&pm->lock);
	pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
	if (pm->priv_queue == NULL) {
		mutex_destroy(&pm->lock);
		return -ENOMEM;
	}
	pm->allocated = false;

	return 0;
}
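
/* Tear down the packet manager's lock and its HIQ kernel queue. */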
void pm_uninit(struct packet_manager *pm)
{
	BUG_ON(!pm);

	mutex_destroy(&pm->lock);
	kernel_queue_uninit(pm->priv_queue);
}
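
/*
 * Send a SET_RESOURCES packet over the HIQ describing the VMIDs, queue
 * slots, GDS heap and OAC/GWS resources the HW scheduler may distribute.
 */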
int pm_send_set_resources(struct packet_manager *pm,
				struct scheduling_resources *res)
{
	struct pm4_set_resources *packet;

	BUG_ON(!pm || !res);

	pr_debug("kfd: In func %s\n", __func__);

	mutex_lock(&pm->lock);
	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
					sizeof(*packet) / sizeof(uint32_t),
					(unsigned int **)&packet);
	if (packet == NULL) {
		mutex_unlock(&pm->lock);
		pr_err("kfd: failed to allocate buffer on kernel queue\n");
		return -ENOMEM;
	}

	memset(packet, 0, sizeof(struct pm4_set_resources));
	packet->header.u32all = build_pm4_header(IT_SET_RESOURCES,
					sizeof(struct pm4_set_resources));

	packet->bitfields2.queue_type =
			queue_type__mes_set_resources__hsa_interface_queue_hiq;
	packet->bitfields2.vmid_mask = res->vmid_mask;
	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
	packet->bitfields7.oac_mask = res->oac_mask;
	packet->bitfields8.gds_heap_base = res->gds_heap_base;
	packet->bitfields8.gds_heap_size = res->gds_heap_size;

	packet->gws_mask_lo = lower_32_bits(res->gws_mask);
	packet->gws_mask_hi = upper_32_bits(res->gws_mask);

	packet->queue_mask_lo = lower_32_bits(res->queue_mask);
	packet->queue_mask_hi = upper_32_bits(res->queue_mask);

	pm->priv_queue->ops.submit_packet(pm->priv_queue);

	mutex_unlock(&pm->lock);

	return 0;
}
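
/*
 * Build a runlist IB for all DQM queues and submit a RUN_LIST packet over
 * the HIQ pointing the CP at it; on failure the packet buffer is rolled
 * back and the IB released.
 */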
int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
{
	uint64_t rl_gpu_ib_addr;
	uint32_t *rl_buffer;
	size_t rl_ib_size, packet_size_dwords;
	int retval;

	BUG_ON(!pm || !dqm_queues);

	retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr,
					&rl_ib_size);
	if (retval != 0)
		goto fail_create_runlist_ib;

	pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr);

	packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t);
	mutex_lock(&pm->lock);

	retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
					packet_size_dwords, &rl_buffer);
	if (retval != 0)
		goto fail_acquire_packet_buffer;

	retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
					rl_ib_size / sizeof(uint32_t), false);
	if (retval != 0)
		goto fail_create_runlist;

	pm->priv_queue->ops.submit_packet(pm->priv_queue);

	mutex_unlock(&pm->lock);

	return retval;

fail_create_runlist:
	pm->priv_queue->ops.rollback_packet(pm->priv_queue);
fail_acquire_packet_buffer:
	mutex_unlock(&pm->lock);
fail_create_runlist_ib:
	if (pm->allocated == true)
		pm_release_ib(pm);
	return retval;
}
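
/*
 * Send a QUERY_STATUS packet; the CP is expected to write fence_value to
 * fence_address once the preceding packets have been processed, so callers
 * can poll the fence for completion.
 */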
int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
			uint32_t fence_value)
{
	int retval;
	struct pm4_query_status *packet;

	BUG_ON(!pm || !fence_address);

	mutex_lock(&pm->lock);
	retval = pm->priv_queue->ops.acquire_packet_buffer(
			pm->priv_queue,
			sizeof(struct pm4_query_status) / sizeof(uint32_t),
			(unsigned int **)&packet);
	if (retval != 0)
		goto fail_acquire_packet_buffer;

	packet->header.u32all = build_pm4_header(IT_QUERY_STATUS,
					sizeof(struct pm4_query_status));

	packet->bitfields2.context_id = 0;
	packet->bitfields2.interrupt_sel =
			interrupt_sel__mes_query_status__completion_status;
	packet->bitfields2.command =
			command__mes_query_status__fence_only_after_write_ack;

	packet->addr_hi = upper_32_bits((uint64_t)fence_address);
	packet->addr_lo = lower_32_bits((uint64_t)fence_address);
	packet->data_hi = upper_32_bits((uint64_t)fence_value);
	packet->data_lo = lower_32_bits((uint64_t)fence_value);

	pm->priv_queue->ops.submit_packet(pm->priv_queue);
	mutex_unlock(&pm->lock);

	return 0;

fail_acquire_packet_buffer:
	mutex_unlock(&pm->lock);
	return retval;
}
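
/*
 * Send an UNMAP_QUEUES packet that preempts (or resets, when reset is true)
 * the queues matching the filter: one doorbell, one PASID, all active
 * queues, or dynamic queues only.
 */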
int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
			enum kfd_preempt_type_filter mode,
			uint32_t filter_param, bool reset,
			unsigned int sdma_engine)
{
	int retval;
	uint32_t *buffer;
	struct pm4_unmap_queues *packet;

	BUG_ON(!pm);

	mutex_lock(&pm->lock);
	retval = pm->priv_queue->ops.acquire_packet_buffer(
			pm->priv_queue,
			sizeof(struct pm4_unmap_queues) / sizeof(uint32_t),
			&buffer);
	if (retval != 0)
		goto err_acquire_packet_buffer;

	packet = (struct pm4_unmap_queues *)buffer;
	memset(buffer, 0, sizeof(struct pm4_unmap_queues));
	pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n",
		mode, reset, type);
	packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES,
					sizeof(struct pm4_unmap_queues));
	switch (type) {
	case KFD_QUEUE_TYPE_COMPUTE:
	case KFD_QUEUE_TYPE_DIQ:
		packet->bitfields2.engine_sel =
			engine_sel__mes_unmap_queues__compute;
		break;
	case KFD_QUEUE_TYPE_SDMA:
		packet->bitfields2.engine_sel =
			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
		break;
	default:
		BUG();
		break;
	}

	if (reset)
		packet->bitfields2.action =
				action__mes_unmap_queues__reset_queues;
	else
		packet->bitfields2.action =
				action__mes_unmap_queues__preempt_queues;

	switch (mode) {
	case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE:
		packet->bitfields2.queue_sel =
			queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
		packet->bitfields2.num_queues = 1;
		packet->bitfields3b.doorbell_offset0 = filter_param;
		break;
	case KFD_PREEMPT_TYPE_FILTER_BY_PASID:
		packet->bitfields2.queue_sel =
			queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
		packet->bitfields3a.pasid = filter_param;
		break;
	case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES:
		packet->bitfields2.queue_sel =
			queue_sel__mes_unmap_queues__perform_request_on_all_active_queues;
		break;
	case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES:
		/* in this case, we do not preempt static queues */
		packet->bitfields2.queue_sel =
			queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only;
		break;
	default:
		BUG();
		break;
	}

	pm->priv_queue->ops.submit_packet(pm->priv_queue);

	mutex_unlock(&pm->lock);
	return 0;

err_acquire_packet_buffer:
	mutex_unlock(&pm->lock);
	return retval;
}
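
/* Free the runlist IB, if one is allocated, under the packet manager lock. */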
void pm_release_ib(struct packet_manager *pm)
{
	BUG_ON(!pm);

	mutex_lock(&pm->lock);
	if (pm->allocated) {
		kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj);
		pm->allocated = false;
	}
	mutex_unlock(&pm->lock);
}