drivers/misc/habanalabs/common/command_submission.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  */
7
8 #include <uapi/misc/habanalabs.h>
9 #include "habanalabs.h"
10
11 #include <linux/uaccess.h>
12 #include <linux/slab.h>
13
14 #define HL_CS_FLAGS_TYPE_MASK   (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
15                                 HL_CS_FLAGS_COLLECTIVE_WAIT)
16
17 #define MAX_TS_ITER_NUM 10
18
19 /**
20  * enum hl_cs_wait_status - cs wait status
21  * @CS_WAIT_STATUS_BUSY: cs was not completed yet
22  * @CS_WAIT_STATUS_COMPLETED: cs completed
23  * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
24  */
25 enum hl_cs_wait_status {
26         CS_WAIT_STATUS_BUSY,
27         CS_WAIT_STATUS_COMPLETED,
28         CS_WAIT_STATUS_GONE
29 };
30
31 static void job_wq_completion(struct work_struct *work);
32 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
33                                 u64 timeout_us, u64 seq,
34                                 enum hl_cs_wait_status *status, s64 *timestamp);
35 static void cs_do_release(struct kref *ref);
36
37 static void hl_sob_reset(struct kref *ref)
38 {
39         struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
40                                                         kref);
41         struct hl_device *hdev = hw_sob->hdev;
42
43         dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
44
45         hdev->asic_funcs->reset_sob(hdev, hw_sob);
46
47         hw_sob->need_reset = false;
48 }
49
50 void hl_sob_reset_error(struct kref *ref)
51 {
52         struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
53                                                         kref);
54         struct hl_device *hdev = hw_sob->hdev;
55
56         dev_crit(hdev->dev,
57                 "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
58                 hw_sob->q_idx, hw_sob->sob_id);
59 }
60
61 void hw_sob_put(struct hl_hw_sob *hw_sob)
62 {
63         if (hw_sob)
64                 kref_put(&hw_sob->kref, hl_sob_reset);
65 }
66
67 static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
68 {
69         if (hw_sob)
70                 kref_put(&hw_sob->kref, hl_sob_reset_error);
71 }
72
73 void hw_sob_get(struct hl_hw_sob *hw_sob)
74 {
75         if (hw_sob)
76                 kref_get(&hw_sob->kref);
77 }
78
79 /**
80  * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
81  * @sob_base: sob base id
82  * @sob_mask: sob user mask, each bit represents a sob offset from sob base
83  * @mask: generated mask
84  *
85  * Return: 0 if given parameters are valid
86  */
87 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
88 {
89         int i;
90
91         if (sob_mask == 0)
92                 return -EINVAL;
93
94         if (sob_mask == 0x1) {
95                 *mask = ~(1 << (sob_base & 0x7));
96         } else {
97                 /* find msb in order to verify sob range is valid */
98                 for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
99                         if (BIT(i) & sob_mask)
100                                 break;
101
102                 if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
103                         return -EINVAL;
104
105                 *mask = ~sob_mask;
106         }
107
108         return 0;
109 }
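/*
 * Illustrative example of the mask computation above (values chosen for
 * clarity only): with sob_base = 10, the offset inside the monitor window is
 * (10 & 0x7) = 2. A single SOB (sob_mask == 0x1) therefore yields
 * *mask = ~(1 << 2) = 0xfb, while sob_mask = 0x3 (msb at bit 1, assumed to
 * fit inside the remaining HL_MAX_SOBS_PER_MONITOR window) yields
 * *mask = ~0x3 = 0xfc.
 */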
110
111 static void hl_fence_release(struct kref *kref)
112 {
113         struct hl_fence *fence =
114                 container_of(kref, struct hl_fence, refcount);
115         struct hl_cs_compl *hl_cs_cmpl =
116                 container_of(fence, struct hl_cs_compl, base_fence);
117
118         kfree(hl_cs_cmpl);
119 }
120
121 void hl_fence_put(struct hl_fence *fence)
122 {
123         if (IS_ERR_OR_NULL(fence))
124                 return;
125         kref_put(&fence->refcount, hl_fence_release);
126 }
127
128 void hl_fences_put(struct hl_fence **fence, int len)
129 {
130         int i;
131
132         for (i = 0; i < len; i++, fence++)
133                 hl_fence_put(*fence);
134 }
135
136 void hl_fence_get(struct hl_fence *fence)
137 {
138         if (fence)
139                 kref_get(&fence->refcount);
140 }
141
142 static void hl_fence_init(struct hl_fence *fence, u64 sequence)
143 {
144         kref_init(&fence->refcount);
145         fence->cs_sequence = sequence;
146         fence->error = 0;
147         fence->timestamp = ktime_set(0, 0);
148         fence->mcs_handling_done = false;
149         init_completion(&fence->completion);
150 }
151
152 void cs_get(struct hl_cs *cs)
153 {
154         kref_get(&cs->refcount);
155 }
156
157 static int cs_get_unless_zero(struct hl_cs *cs)
158 {
159         return kref_get_unless_zero(&cs->refcount);
160 }
161
162 static void cs_put(struct hl_cs *cs)
163 {
164         kref_put(&cs->refcount, cs_do_release);
165 }
166
167 static void cs_job_do_release(struct kref *ref)
168 {
169         struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
170
171         kfree(job);
172 }
173
174 static void cs_job_put(struct hl_cs_job *job)
175 {
176         kref_put(&job->refcount, cs_job_do_release);
177 }
178
179 bool cs_needs_completion(struct hl_cs *cs)
180 {
181         /* In case this is a staged CS, only the last CS in the sequence
182          * should get a completion; any non-staged CS always gets a completion
183          */
184         if (cs->staged_cs && !cs->staged_last)
185                 return false;
186
187         return true;
188 }
189
190 bool cs_needs_timeout(struct hl_cs *cs)
191 {
192         /* In case this is a staged CS, only the first CS in the sequence
193          * should get a timeout; any non-staged CS always gets a timeout
194          */
195         if (cs->staged_cs && !cs->staged_first)
196                 return false;
197
198         return true;
199 }
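/*
 * Taken together: in a staged submission S1 (staged_first) -> S2 -> S3
 * (staged_last), only S1 gets a TDR timeout and only S3 gets a completion,
 * while the middle CS S2 gets neither. A non-staged CS (or a single staged CS
 * marked as both first and last) gets both.
 */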
200
201 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
202 {
203         /*
204          * Patched CB is created for external queues jobs, and for H/W queues
205          * jobs if the user CB was allocated by driver and MMU is disabled.
206          */
207         return (job->queue_type == QUEUE_TYPE_EXT ||
208                         (job->queue_type == QUEUE_TYPE_HW &&
209                                         job->is_kernel_allocated_cb &&
210                                         !hdev->mmu_enable));
211 }
212
213 /*
214  * cs_parser - parse the user command submission
215  *
216  * @hpriv: pointer to the private data of the fd
217  * @job: pointer to the job that holds the command submission info
218  *
219  * The function parses the command submission of the user. It calls the
220  * ASIC-specific parser, which returns a list of memory blocks to send
221  * to the device as different command buffers.
222  *
223  */
224 static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
225 {
226         struct hl_device *hdev = hpriv->hdev;
227         struct hl_cs_parser parser;
228         int rc;
229
230         parser.ctx_id = job->cs->ctx->asid;
231         parser.cs_sequence = job->cs->sequence;
232         parser.job_id = job->id;
233
234         parser.hw_queue_id = job->hw_queue_id;
235         parser.job_userptr_list = &job->userptr_list;
236         parser.patched_cb = NULL;
237         parser.user_cb = job->user_cb;
238         parser.user_cb_size = job->user_cb_size;
239         parser.queue_type = job->queue_type;
240         parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
241         job->patched_cb = NULL;
242         parser.completion = cs_needs_completion(job->cs);
243
244         rc = hdev->asic_funcs->cs_parser(hdev, &parser);
245
246         if (is_cb_patched(hdev, job)) {
247                 if (!rc) {
248                         job->patched_cb = parser.patched_cb;
249                         job->job_cb_size = parser.patched_cb_size;
250                         job->contains_dma_pkt = parser.contains_dma_pkt;
251                         atomic_inc(&job->patched_cb->cs_cnt);
252                 }
253
254                 /*
255                  * Whether the parsing worked or not, we don't need the
256                  * original CB anymore because it was already parsed and
257                  * won't be accessed again for this CS
258                  */
259                 atomic_dec(&job->user_cb->cs_cnt);
260                 hl_cb_put(job->user_cb);
261                 job->user_cb = NULL;
262         } else if (!rc) {
263                 job->job_cb_size = job->user_cb_size;
264         }
265
266         return rc;
267 }
268
269 static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
270 {
271         struct hl_cs *cs = job->cs;
272
273         if (is_cb_patched(hdev, job)) {
274                 hl_userptr_delete_list(hdev, &job->userptr_list);
275
276                 /*
277                  * We might arrive here from rollback and patched CB wasn't
278                  * created, so we need to check it's not NULL
279                  */
280                 if (job->patched_cb) {
281                         atomic_dec(&job->patched_cb->cs_cnt);
282                         hl_cb_put(job->patched_cb);
283                 }
284         }
285
286         /* For H/W queue jobs, if a user CB was allocated by driver and MMU is
287          * enabled, the user CB isn't released in cs_parser() and thus should be
288          * released here.
289          * This is also true for INT queues jobs which were allocated by driver
290          */
291         if (job->is_kernel_allocated_cb &&
292                 ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
293                                 job->queue_type == QUEUE_TYPE_INT)) {
294                 atomic_dec(&job->user_cb->cs_cnt);
295                 hl_cb_put(job->user_cb);
296         }
297
298         /*
299          * This is the only place where there can be multiple threads
300          * modifying the list at the same time
301          */
302         spin_lock(&cs->job_lock);
303         list_del(&job->cs_node);
304         spin_unlock(&cs->job_lock);
305
306         hl_debugfs_remove_job(hdev, job);
307
308         /* We decrement reference only for a CS that gets completion
309          * because the reference was incremented only for this kind of CS
310          * right before it was scheduled.
311          *
312          * In staged submission, only the last CS marked as 'staged_last'
313          * gets completion, hence its release function will be called from here.
314          * All other CS's in the staged submission do not get a completion;
315          * their CS reference will be decremented by the
316          * 'staged_last' CS during the CS release flow.
317          * All relevant PQ CI counters will be incremented during the CS release
318          * flow by calling 'hl_hw_queue_update_ci'.
319          */
320         if (cs_needs_completion(cs) &&
321                 (job->queue_type == QUEUE_TYPE_EXT ||
322                         job->queue_type == QUEUE_TYPE_HW))
323                 cs_put(cs);
324
325         cs_job_put(job);
326 }
327
328 /*
329  * hl_staged_cs_find_first - locate the first CS in this staged submission
330  *
331  * @hdev: pointer to device structure
332  * @cs_seq: staged submission sequence number
333  *
334  * @note: This function must be called under 'hdev->cs_mirror_lock'
335  *
336  * Find and return a CS pointer with the given sequence
337  */
338 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
339 {
340         struct hl_cs *cs;
341
342         list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
343                 if (cs->staged_cs && cs->staged_first &&
344                                 cs->sequence == cs_seq)
345                         return cs;
346
347         return NULL;
348 }
349
350 /*
351  * is_staged_cs_last_exists - returns true if the last CS in sequence exists
352  *
353  * @hdev: pointer to device structure
354  * @cs: staged submission member
355  *
356  */
357 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
358 {
359         struct hl_cs *last_entry;
360
361         last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
362                                                                 staged_cs_node);
363
364         if (last_entry->staged_last)
365                 return true;
366
367         return false;
368 }
369
370 /*
371  * staged_cs_get - get CS reference if this CS is a part of a staged CS
372  *
373  * @hdev: pointer to device structure
374  * @cs: current CS
375  * @cs_seq: staged submission sequence number
376  *
377  * Increment CS reference for every CS in this staged submission except for
378  * the CS which gets completion.
379  */
380 static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
381 {
382         /* Only the last CS in this staged submission will get a completion.
383          * We must increment the reference for all other CS's in this
384          * staged submission.
385          * Once we get a completion we will release the whole staged submission.
386          */
387         if (!cs->staged_last)
388                 cs_get(cs);
389 }
390
391 /*
392  * staged_cs_put - put a CS in case it is part of staged submission
393  *
394  * @hdev: pointer to device structure
395  * @cs: CS to put
396  *
397  * This function decrements a CS reference (for a non completion CS)
398  */
399 static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
400 {
401         /* We release all CS's in a staged submission except the last
402          * CS, whose reference we never incremented.
403          */
404         if (!cs_needs_completion(cs))
405                 cs_put(cs);
406 }
407
408 static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
409 {
410         bool next_entry_found = false;
411         struct hl_cs *next, *first_cs;
412
413         if (!cs_needs_timeout(cs))
414                 return;
415
416         spin_lock(&hdev->cs_mirror_lock);
417
418         /* We need to handle TDR only once for the complete staged submission.
419          * Hence, we choose the CS that reaches this function first, which is
420          * the CS marked as 'staged_last'.
421          * In case a single staged CS was submitted which has both first and
422          * last indications, then "cs_find_first" below will return NULL, since
423          * we removed the CS node from the list before getting here.
424          * In such a case just continue with the CS to cancel its TDR work.
425          */
426         if (cs->staged_cs && cs->staged_last) {
427                 first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
428                 if (first_cs)
429                         cs = first_cs;
430         }
431
432         spin_unlock(&hdev->cs_mirror_lock);
433
434         /* Don't cancel TDR in case this CS timed out, because we might be
435          * running from the TDR context
436          */
437         if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
438                 return;
439
440         if (cs->tdr_active)
441                 cancel_delayed_work_sync(&cs->work_tdr);
442
443         spin_lock(&hdev->cs_mirror_lock);
444
445         /* queue TDR for next CS */
446         list_for_each_entry(next, &hdev->cs_mirror_list, mirror_node)
447                 if (cs_needs_timeout(next)) {
448                         next_entry_found = true;
449                         break;
450                 }
451
452         if (next_entry_found && !next->tdr_active) {
453                 next->tdr_active = true;
454                 schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
455         }
456
457         spin_unlock(&hdev->cs_mirror_lock);
458 }
459
460 /*
461  * force_complete_multi_cs - complete all contexts that wait on multi-CS
462  *
463  * @hdev: pointer to habanalabs device structure
464  */
465 static void force_complete_multi_cs(struct hl_device *hdev)
466 {
467         int i;
468
469         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
470                 struct multi_cs_completion *mcs_compl;
471
472                 mcs_compl = &hdev->multi_cs_completion[i];
473
474                 spin_lock(&mcs_compl->lock);
475
476                 if (!mcs_compl->used) {
477                         spin_unlock(&mcs_compl->lock);
478                         continue;
479                 }
480
481                 /* When calling force complete, no context should be waiting
482                  * on multi-CS.
483                  * The function is called as a protection for such a case, to
484                  * free any pending context and print an error message
485                  */
486                 dev_err(hdev->dev,
487                                 "multi-CS completion context %d still waiting when calling force completion\n",
488                                 i);
489                 complete_all(&mcs_compl->completion);
490                 spin_unlock(&mcs_compl->lock);
491         }
492 }
493
494 /*
495  * complete_multi_cs - complete all waiting entities on multi-CS
496  *
497  * @hdev: pointer to habanalabs device structure
498  * @cs: CS structure
499  * The function signals a waiting entity that has overlapping stream masters
500  * with the completed CS.
501  * For example:
502  * - a completed CS worked on stream master QID 4, multi-CS completion
503  *   is actively waiting on stream master QIDs 3, 5: don't send a signal, as
504  *   there is no common stream master QID
505  * - a completed CS worked on stream master QID 4, multi-CS completion
506  *   is actively waiting on stream master QIDs 3, 4: send a signal, as stream
507  *   master QID 4 is common
508  */
509 static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
510 {
511         struct hl_fence *fence = cs->fence;
512         int i;
513
514         /* in case of multi CS check for completion only for the first CS */
515         if (cs->staged_cs && !cs->staged_first)
516                 return;
517
518         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
519                 struct multi_cs_completion *mcs_compl;
520
521                 mcs_compl = &hdev->multi_cs_completion[i];
522                 if (!mcs_compl->used)
523                         continue;
524
525                 spin_lock(&mcs_compl->lock);
526
527                 /*
528                  * complete if:
529                  * 1. still waiting for completion
530                  * 2. the completed CS has at least one overlapping stream
531                  *    master with the stream masters in the completion
532                  */
533                 if (mcs_compl->used &&
534                                 (fence->stream_master_qid_map &
535                                         mcs_compl->stream_master_qid_map)) {
536                         /* extract the timestamp only of first completed CS */
537                         if (!mcs_compl->timestamp)
538                                 mcs_compl->timestamp = ktime_to_ns(fence->timestamp);
539
540                         complete_all(&mcs_compl->completion);
541
542                         /*
543                          * Setting mcs_handling_done inside the lock ensures
544                          * at least one fence has mcs_handling_done set to
545                          * true before the wait for mcs finishes. This ensures
546                          * at least one CS will be set as completed when
547                          * polling mcs fences.
548                          */
549                         fence->mcs_handling_done = true;
550                 }
551
552                 spin_unlock(&mcs_compl->lock);
553         }
554         /* In case CS completed without mcs completion initialized */
555         fence->mcs_handling_done = true;
556 }
557
558 static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
559                                         struct hl_cs *cs,
560                                         struct hl_cs_compl *hl_cs_cmpl)
561 {
562         /* Skip this handler if the cs wasn't submitted, to avoid putting
563          * the hw_sob twice, since this case is already handled at this point.
564          * Also skip if the hw_sob pointer wasn't set.
565          */
566         if (!hl_cs_cmpl->hw_sob || !cs->submitted)
567                 return;
568
569         spin_lock(&hl_cs_cmpl->lock);
570
571         /*
572          * We get a refcount on the hw_sob object upon reservation of signals
573          * or a signal/wait CS, and need to put it when the first staged CS
574          * (which contains the encaps signals) or the signal/wait CS is completed.
575          */
576         if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
577                         (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
578                         (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
579                         (!!hl_cs_cmpl->encaps_signals)) {
580                 dev_dbg(hdev->dev,
581                                 "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
582                                 hl_cs_cmpl->cs_seq,
583                                 hl_cs_cmpl->type,
584                                 hl_cs_cmpl->hw_sob->sob_id,
585                                 hl_cs_cmpl->sob_val);
586
587                 hw_sob_put(hl_cs_cmpl->hw_sob);
588
589                 if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
590                         hdev->asic_funcs->reset_sob_group(hdev,
591                                         hl_cs_cmpl->sob_group);
592         }
593
594         spin_unlock(&hl_cs_cmpl->lock);
595 }
596
597 static void cs_do_release(struct kref *ref)
598 {
599         struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
600         struct hl_device *hdev = cs->ctx->hdev;
601         struct hl_cs_job *job, *tmp;
602         struct hl_cs_compl *hl_cs_cmpl =
603                         container_of(cs->fence, struct hl_cs_compl, base_fence);
604
605         cs->completed = true;
606
607         /*
608          * Although reaching here means that all external jobs have finished,
609          * because each one of them took a refcount on the CS, we still
610          * need to go over the internal jobs and complete them. Otherwise, we
611          * will have leaked memory and what's worse, the CS object (and
612          * potentially the CTX object) could be released, while the JOB
613          * still holds a pointer to them (but no reference).
614          */
615         list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
616                 complete_job(hdev, job);
617
618         if (!cs->submitted) {
619                 /*
620                  * In case the wait for signal CS was submitted, the fence put
621                  * occurs in init_signal_wait_cs() or collective_wait_init_cs()
622                  * right before hanging on the PQ.
623                  */
624                 if (cs->type == CS_TYPE_WAIT ||
625                                 cs->type == CS_TYPE_COLLECTIVE_WAIT)
626                         hl_fence_put(cs->signal_fence);
627
628                 goto out;
629         }
630
631         /* Need to update CI for all queue jobs that do not get completion */
632         hl_hw_queue_update_ci(cs);
633
634         /* remove CS from CS mirror list */
635         spin_lock(&hdev->cs_mirror_lock);
636         list_del_init(&cs->mirror_node);
637         spin_unlock(&hdev->cs_mirror_lock);
638
639         cs_handle_tdr(hdev, cs);
640
641         if (cs->staged_cs) {
642                 /* the completion CS decrements reference for the entire
643                  * staged submission
644                  */
645                 if (cs->staged_last) {
646                         struct hl_cs *staged_cs, *tmp;
647
648                         list_for_each_entry_safe(staged_cs, tmp,
649                                         &cs->staged_cs_node, staged_cs_node)
650                                 staged_cs_put(hdev, staged_cs);
651                 }
652
653                 /* A staged CS will be a member in the list only after it
654                  * was submitted. We used 'cs_mirror_lock' when inserting
655                  * it into the list, so we use it again when removing it
656                  */
657                 if (cs->submitted) {
658                         spin_lock(&hdev->cs_mirror_lock);
659                         list_del(&cs->staged_cs_node);
660                         spin_unlock(&hdev->cs_mirror_lock);
661                 }
662
663                 /* Decrement the refcount to handle the case where the first
664                  * staged CS with encaps signals is completed.
665                  */
666                 if (hl_cs_cmpl->encaps_signals)
667                         kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
668                                                 hl_encaps_handle_do_release);
669         }
670
671         if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
672                         && cs->encaps_signals)
673                 kref_put(&cs->encaps_sig_hdl->refcount,
674                                         hl_encaps_handle_do_release);
675
676 out:
677         /* Must be called before hl_ctx_put because inside we use ctx to get
678          * the device
679          */
680         hl_debugfs_remove_cs(cs);
681
682         hl_ctx_put(cs->ctx);
683
684         /* We need to mark an error for a CS that was not submitted; in that
685          * case the hl fence release flow is different. Mainly, we don't need
686          * to handle hw_sob for signal/wait
687          */
688         if (cs->timedout)
689                 cs->fence->error = -ETIMEDOUT;
690         else if (cs->aborted)
691                 cs->fence->error = -EIO;
692         else if (!cs->submitted)
693                 cs->fence->error = -EBUSY;
694
695         if (unlikely(cs->skip_reset_on_timeout)) {
696                 dev_err(hdev->dev,
697                         "Command submission %llu completed after %llu (s)\n",
698                         cs->sequence,
699                         div_u64(jiffies - cs->submission_time_jiffies, HZ));
700         }
701
702         if (cs->timestamp)
703                 cs->fence->timestamp = ktime_get();
704         complete_all(&cs->fence->completion);
705         complete_multi_cs(hdev, cs);
706
707         cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
708
709         hl_fence_put(cs->fence);
710
711         kfree(cs->jobs_in_queue_cnt);
712         kfree(cs);
713 }
714
715 static void cs_timedout(struct work_struct *work)
716 {
717         struct hl_device *hdev;
718         int rc;
719         struct hl_cs *cs = container_of(work, struct hl_cs,
720                                                  work_tdr.work);
721         bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
722
723         rc = cs_get_unless_zero(cs);
724         if (!rc)
725                 return;
726
727         if ((!cs->submitted) || (cs->completed)) {
728                 cs_put(cs);
729                 return;
730         }
731
732         /* Mark that the CS timed out so we won't try to cancel its TDR */
733         if (likely(!skip_reset_on_timeout))
734                 cs->timedout = true;
735
736         hdev = cs->ctx->hdev;
737
738         /* Save only the first CS timeout parameters */
739         rc = atomic_cmpxchg(&hdev->last_error.cs_write_disable, 0, 1);
740         if (!rc) {
741                 hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
742                 hdev->last_error.cs_timeout_timestamp = ktime_get();
743                 hdev->last_error.cs_timeout_seq = cs->sequence;
744         }
745
746         switch (cs->type) {
747         case CS_TYPE_SIGNAL:
748                 dev_err(hdev->dev,
749                         "Signal command submission %llu has not finished in time!\n",
750                         cs->sequence);
751                 break;
752
753         case CS_TYPE_WAIT:
754                 dev_err(hdev->dev,
755                         "Wait command submission %llu has not finished in time!\n",
756                         cs->sequence);
757                 break;
758
759         case CS_TYPE_COLLECTIVE_WAIT:
760                 dev_err(hdev->dev,
761                         "Collective Wait command submission %llu has not finished in time!\n",
762                         cs->sequence);
763                 break;
764
765         default:
766                 dev_err(hdev->dev,
767                         "Command submission %llu has not finished in time!\n",
768                         cs->sequence);
769                 break;
770         }
771
772         rc = hl_state_dump(hdev);
773         if (rc)
774                 dev_err(hdev->dev, "Error during system state dump %d\n", rc);
775
776         cs_put(cs);
777
778         if (likely(!skip_reset_on_timeout)) {
779                 if (hdev->reset_on_lockup)
780                         hl_device_reset(hdev, HL_DRV_RESET_TDR);
781                 else
782                         hdev->reset_info.needs_reset = true;
783         }
784 }
785
786 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
787                         enum hl_cs_type cs_type, u64 user_sequence,
788                         struct hl_cs **cs_new, u32 flags, u32 timeout)
789 {
790         struct hl_cs_counters_atomic *cntr;
791         struct hl_fence *other = NULL;
792         struct hl_cs_compl *cs_cmpl;
793         struct hl_cs *cs;
794         int rc;
795
796         cntr = &hdev->aggregated_cs_counters;
797
798         cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
799         if (!cs)
800                 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
801
802         if (!cs) {
803                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
804                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
805                 return -ENOMEM;
806         }
807
808         /* increment refcnt for context */
809         hl_ctx_get(hdev, ctx);
810
811         cs->ctx = ctx;
812         cs->submitted = false;
813         cs->completed = false;
814         cs->type = cs_type;
815         cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
816         cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
817         cs->timeout_jiffies = timeout;
818         cs->skip_reset_on_timeout =
819                 hdev->reset_info.skip_reset_on_timeout ||
820                 !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
821         cs->submission_time_jiffies = jiffies;
822         INIT_LIST_HEAD(&cs->job_list);
823         INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
824         kref_init(&cs->refcount);
825         spin_lock_init(&cs->job_lock);
826
827         cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
828         if (!cs_cmpl)
829                 cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
830
831         if (!cs_cmpl) {
832                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
833                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
834                 rc = -ENOMEM;
835                 goto free_cs;
836         }
837
838         cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
839                         sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
840         if (!cs->jobs_in_queue_cnt)
841                 cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
842                                 sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
843
844         if (!cs->jobs_in_queue_cnt) {
845                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
846                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
847                 rc = -ENOMEM;
848                 goto free_cs_cmpl;
849         }
850
851         cs_cmpl->hdev = hdev;
852         cs_cmpl->type = cs->type;
853         spin_lock_init(&cs_cmpl->lock);
854         cs->fence = &cs_cmpl->base_fence;
855
856         spin_lock(&ctx->cs_lock);
857
858         cs_cmpl->cs_seq = ctx->cs_sequence;
859         other = ctx->cs_pending[cs_cmpl->cs_seq &
860                                 (hdev->asic_prop.max_pending_cs - 1)];
861
862         if (other && !completion_done(&other->completion)) {
863                 /* If the following statement is true, it means we have reached
864                  * a point in which only part of the staged submission was
865                  * submitted and we don't have enough room in the 'cs_pending'
866                  * array for the rest of the submission.
867                  * This causes a deadlock because this CS will never be
868                  * completed as it depends on future CS's for completion.
869                  */
870                 if (other->cs_sequence == user_sequence)
871                         dev_crit_ratelimited(hdev->dev,
872                                 "Staged CS %llu deadlock due to lack of resources",
873                                 user_sequence);
874
875                 dev_dbg_ratelimited(hdev->dev,
876                         "Rejecting CS because of too many in-flight CS\n");
877                 atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
878                 atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
879                 rc = -EAGAIN;
880                 goto free_fence;
881         }
882
883         /* init hl_fence */
884         hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);
885
886         cs->sequence = cs_cmpl->cs_seq;
887
888         ctx->cs_pending[cs_cmpl->cs_seq &
889                         (hdev->asic_prop.max_pending_cs - 1)] =
890                                                         &cs_cmpl->base_fence;
891         ctx->cs_sequence++;
892
893         hl_fence_get(&cs_cmpl->base_fence);
894
895         hl_fence_put(other);
896
897         spin_unlock(&ctx->cs_lock);
898
899         *cs_new = cs;
900
901         return 0;
902
903 free_fence:
904         spin_unlock(&ctx->cs_lock);
905         kfree(cs->jobs_in_queue_cnt);
906 free_cs_cmpl:
907         kfree(cs_cmpl);
908 free_cs:
909         kfree(cs);
910         hl_ctx_put(ctx);
911         return rc;
912 }
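/*
 * Note on the cs_pending ring used above: slots are indexed with
 * 'seq & (max_pending_cs - 1)', which effectively assumes max_pending_cs is a
 * power of two. For example (hypothetical value), with max_pending_cs = 64,
 * sequence 70 maps to slot 6; if the fence already stored in that slot has not
 * completed yet, the new CS is rejected with -EAGAIN.
 */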
913
914 static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
915 {
916         struct hl_cs_job *job, *tmp;
917
918         staged_cs_put(hdev, cs);
919
920         list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
921                 complete_job(hdev, job);
922 }
923
924 void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
925 {
926         int i;
927         struct hl_cs *cs, *tmp;
928
929         if (!skip_wq_flush) {
930                 flush_workqueue(hdev->ts_free_obj_wq);
931
932                 /* flush all completions before iterating over the CS mirror list in
933                  * order to avoid a race with the release functions
934                  */
935                 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
936                         flush_workqueue(hdev->cq_wq[i]);
937
938         }
939
940         /* Make sure we don't have leftovers in the CS mirror list */
941         list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
942                 cs_get(cs);
943                 cs->aborted = true;
944                 dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
945                                 cs->ctx->asid, cs->sequence);
946                 cs_rollback(hdev, cs);
947                 cs_put(cs);
948         }
949
950         force_complete_multi_cs(hdev);
951 }
952
953 static void
954 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
955 {
956         struct hl_user_pending_interrupt *pend, *temp;
957         unsigned long flags;
958
959         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
960         list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
961                 if (pend->ts_reg_info.ts_buff) {
962                         list_del(&pend->wait_list_node);
963                         hl_ts_put(pend->ts_reg_info.ts_buff);
964                         hl_cb_put(pend->ts_reg_info.cq_cb);
965                 } else {
966                         pend->fence.error = -EIO;
967                         complete_all(&pend->fence.completion);
968                 }
969         }
970         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
971 }
972
973 void hl_release_pending_user_interrupts(struct hl_device *hdev)
974 {
975         struct asic_fixed_properties *prop = &hdev->asic_prop;
976         struct hl_user_interrupt *interrupt;
977         int i;
978
979         if (!prop->user_interrupt_count)
980                 return;
981
982         /* We iterate through the user interrupt requests and wake up all
983          * user threads waiting for interrupt completion. We iterate the
984          * list under a lock; this is why all user threads, once awake,
985          * will wait on the same lock and will release the waiting object upon
986          * unlock.
987          */
988
989         for (i = 0 ; i < prop->user_interrupt_count ; i++) {
990                 interrupt = &hdev->user_interrupt[i];
991                 wake_pending_user_interrupt_threads(interrupt);
992         }
993
994         interrupt = &hdev->common_user_interrupt;
995         wake_pending_user_interrupt_threads(interrupt);
996 }
997
998 static void job_wq_completion(struct work_struct *work)
999 {
1000         struct hl_cs_job *job = container_of(work, struct hl_cs_job,
1001                                                 finish_work);
1002         struct hl_cs *cs = job->cs;
1003         struct hl_device *hdev = cs->ctx->hdev;
1004
1005         /* job is no longer needed */
1006         complete_job(hdev, job);
1007 }
1008
1009 static int validate_queue_index(struct hl_device *hdev,
1010                                 struct hl_cs_chunk *chunk,
1011                                 enum hl_queue_type *queue_type,
1012                                 bool *is_kernel_allocated_cb)
1013 {
1014         struct asic_fixed_properties *asic = &hdev->asic_prop;
1015         struct hw_queue_properties *hw_queue_prop;
1016
1017         /* This must be checked here to prevent out-of-bounds access to
1018          * hw_queues_props array
1019          */
1020         if (chunk->queue_index >= asic->max_queues) {
1021                 dev_err(hdev->dev, "Queue index %d is invalid\n",
1022                         chunk->queue_index);
1023                 return -EINVAL;
1024         }
1025
1026         hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
1027
1028         if (hw_queue_prop->type == QUEUE_TYPE_NA) {
1029                 dev_err(hdev->dev, "Queue index %d is invalid\n",
1030                         chunk->queue_index);
1031                 return -EINVAL;
1032         }
1033
1034         if (hw_queue_prop->driver_only) {
1035                 dev_err(hdev->dev,
1036                         "Queue index %d is restricted for the kernel driver\n",
1037                         chunk->queue_index);
1038                 return -EINVAL;
1039         }
1040
1041         /* When the hw queue type isn't QUEUE_TYPE_HW, the
1042          * USER_ALLOC_CB flag shall be treated as "don't care".
1043          */
1044         if (hw_queue_prop->type == QUEUE_TYPE_HW) {
1045                 if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
1046                         if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
1047                                 dev_err(hdev->dev,
1048                                         "Queue index %d doesn't support user CB\n",
1049                                         chunk->queue_index);
1050                                 return -EINVAL;
1051                         }
1052
1053                         *is_kernel_allocated_cb = false;
1054                 } else {
1055                         if (!(hw_queue_prop->cb_alloc_flags &
1056                                         CB_ALLOC_KERNEL)) {
1057                                 dev_err(hdev->dev,
1058                                         "Queue index %d doesn't support kernel CB\n",
1059                                         chunk->queue_index);
1060                                 return -EINVAL;
1061                         }
1062
1063                         *is_kernel_allocated_cb = true;
1064                 }
1065         } else {
1066                 *is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
1067                                                 & CB_ALLOC_KERNEL);
1068         }
1069
1070         *queue_type = hw_queue_prop->type;
1071         return 0;
1072 }
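/*
 * Summary of the CB-allocation decision above (derived from the checks in
 * validate_queue_index):
 *   - QUEUE_TYPE_HW with USER_ALLOC_CB set -> user-allocated CB (queue must
 *     advertise CB_ALLOC_USER)
 *   - QUEUE_TYPE_HW without USER_ALLOC_CB  -> kernel-allocated CB (queue must
 *     advertise CB_ALLOC_KERNEL)
 *   - any other queue type                 -> follows the queue's
 *     CB_ALLOC_KERNEL capability; USER_ALLOC_CB is ignored
 */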
1073
1074 static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
1075                                         struct hl_cb_mgr *cb_mgr,
1076                                         struct hl_cs_chunk *chunk)
1077 {
1078         struct hl_cb *cb;
1079         u32 cb_handle;
1080
1081         cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
1082
1083         cb = hl_cb_get(hdev, cb_mgr, cb_handle);
1084         if (!cb) {
1085                 dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle);
1086                 return NULL;
1087         }
1088
1089         if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
1090                 dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
1091                 goto release_cb;
1092         }
1093
1094         atomic_inc(&cb->cs_cnt);
1095
1096         return cb;
1097
1098 release_cb:
1099         hl_cb_put(cb);
1100         return NULL;
1101 }
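/*
 * Note: chunk->cb_handle is expected to carry the command buffer handle
 * shifted left by PAGE_SHIFT, hence the '>> PAGE_SHIFT' above, and the
 * chunk's cb_size must be at least 8 bytes and no larger than the CB's size.
 */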
1102
1103 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
1104                 enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
1105 {
1106         struct hl_cs_job *job;
1107
1108         job = kzalloc(sizeof(*job), GFP_ATOMIC);
1109         if (!job)
1110                 job = kzalloc(sizeof(*job), GFP_KERNEL);
1111
1112         if (!job)
1113                 return NULL;
1114
1115         kref_init(&job->refcount);
1116         job->queue_type = queue_type;
1117         job->is_kernel_allocated_cb = is_kernel_allocated_cb;
1118
1119         if (is_cb_patched(hdev, job))
1120                 INIT_LIST_HEAD(&job->userptr_list);
1121
1122         if (job->queue_type == QUEUE_TYPE_EXT)
1123                 INIT_WORK(&job->finish_work, job_wq_completion);
1124
1125         return job;
1126 }
1127
1128 static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
1129 {
1130         if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
1131                 return CS_TYPE_SIGNAL;
1132         else if (cs_type_flags & HL_CS_FLAGS_WAIT)
1133                 return CS_TYPE_WAIT;
1134         else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
1135                 return CS_TYPE_COLLECTIVE_WAIT;
1136         else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
1137                 return CS_RESERVE_SIGNALS;
1138         else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
1139                 return CS_UNRESERVE_SIGNALS;
1140         else
1141                 return CS_TYPE_DEFAULT;
1142 }
1143
1144 static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
1145 {
1146         struct hl_device *hdev = hpriv->hdev;
1147         struct hl_ctx *ctx = hpriv->ctx;
1148         u32 cs_type_flags, num_chunks;
1149         enum hl_device_status status;
1150         enum hl_cs_type cs_type;
1151
1152         if (!hl_device_operational(hdev, &status)) {
1153                 return -EBUSY;
1154         }
1155
1156         if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1157                         !hdev->supports_staged_submission) {
1158                 dev_err(hdev->dev, "staged submission not supported");
1159                 return -EPERM;
1160         }
1161
1162         cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
1163
1164         if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
1165                 dev_err(hdev->dev,
1166                         "CS type flags are mutually exclusive, context %d\n",
1167                         ctx->asid);
1168                 return -EINVAL;
1169         }
1170
1171         cs_type = hl_cs_get_cs_type(cs_type_flags);
1172         num_chunks = args->in.num_chunks_execute;
1173
1174         if (unlikely((cs_type != CS_TYPE_DEFAULT) &&
1175                                         !hdev->supports_sync_stream)) {
1176                 dev_err(hdev->dev, "Sync stream CS is not supported\n");
1177                 return -EINVAL;
1178         }
1179
1180         if (cs_type == CS_TYPE_DEFAULT) {
1181                 if (!num_chunks) {
1182                         dev_err(hdev->dev,
1183                                 "Got execute CS with 0 chunks, context %d\n",
1184                                 ctx->asid);
1185                         return -EINVAL;
1186                 }
1187         } else if (num_chunks != 1) {
1188                 dev_err(hdev->dev,
1189                         "Sync stream CS mandates one chunk only, context %d\n",
1190                         ctx->asid);
1191                 return -EINVAL;
1192         }
1193
1194         return 0;
1195 }
1196
1197 static int hl_cs_copy_chunk_array(struct hl_device *hdev,
1198                                         struct hl_cs_chunk **cs_chunk_array,
1199                                         void __user *chunks, u32 num_chunks,
1200                                         struct hl_ctx *ctx)
1201 {
1202         u32 size_to_copy;
1203
1204         if (num_chunks > HL_MAX_JOBS_PER_CS) {
1205                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1206                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1207                 dev_err(hdev->dev,
1208                         "Number of chunks can NOT be larger than %d\n",
1209                         HL_MAX_JOBS_PER_CS);
1210                 return -EINVAL;
1211         }
1212
1213         *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
1214                                         GFP_ATOMIC);
1215         if (!*cs_chunk_array)
1216                 *cs_chunk_array = kmalloc_array(num_chunks,
1217                                         sizeof(**cs_chunk_array), GFP_KERNEL);
1218         if (!*cs_chunk_array) {
1219                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1220                 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1221                 return -ENOMEM;
1222         }
1223
1224         size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
1225         if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
1226                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1227                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1228                 dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
1229                 kfree(*cs_chunk_array);
1230                 return -EFAULT;
1231         }
1232
1233         return 0;
1234 }
1235
1236 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
1237                                 u64 sequence, u32 flags,
1238                                 u32 encaps_signal_handle)
1239 {
1240         if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
1241                 return 0;
1242
1243         cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
1244         cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);
1245
1246         if (cs->staged_first) {
1247                 /* Staged CS sequence is the first CS sequence */
1248                 INIT_LIST_HEAD(&cs->staged_cs_node);
1249                 cs->staged_sequence = cs->sequence;
1250
1251                 if (cs->encaps_signals)
1252                         cs->encaps_sig_hdl_id = encaps_signal_handle;
1253         } else {
1254                 /* User sequence will be validated in 'hl_hw_queue_schedule_cs'
1255                  * under the cs_mirror_lock
1256                  */
1257                 cs->staged_sequence = sequence;
1258         }
1259
1260         /* Increment CS reference if needed */
1261         staged_cs_get(hdev, cs);
1262
1263         cs->staged_cs = true;
1264
1265         return 0;
1266 }
1267
1268 static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
1269 {
1270         int i;
1271
1272         for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
1273                 if (qid == hdev->stream_master_qid_arr[i])
1274                         return BIT(i);
1275
1276         return 0;
1277 }
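/*
 * Illustrative example (QID values are hypothetical): if
 * stream_master_qid_arr = {4, 8, 12}, a job submitted on queue 8 contributes
 * BIT(1) to the CS's stream_master_qid_map, which complete_multi_cs() later
 * intersects with each waiter's stream_master_qid_map to decide whether to
 * signal it.
 */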
1278
1279 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
1280                                 u32 num_chunks, u64 *cs_seq, u32 flags,
1281                                 u32 encaps_signals_handle, u32 timeout,
1282                                 u16 *signal_initial_sob_count)
1283 {
1284         bool staged_mid, int_queues_only = true;
1285         struct hl_device *hdev = hpriv->hdev;
1286         struct hl_cs_chunk *cs_chunk_array;
1287         struct hl_cs_counters_atomic *cntr;
1288         struct hl_ctx *ctx = hpriv->ctx;
1289         struct hl_cs_job *job;
1290         struct hl_cs *cs;
1291         struct hl_cb *cb;
1292         u64 user_sequence;
1293         u8 stream_master_qid_map = 0;
1294         int rc, i;
1295
1296         cntr = &hdev->aggregated_cs_counters;
1297         user_sequence = *cs_seq;
1298         *cs_seq = ULLONG_MAX;
1299
1300         rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
1301                         hpriv->ctx);
1302         if (rc)
1303                 goto out;
1304
1305         if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1306                         !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
1307                 staged_mid = true;
1308         else
1309                 staged_mid = false;
1310
1311         rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
1312                         staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
1313                         timeout);
1314         if (rc)
1315                 goto free_cs_chunk_array;
1316
1317         *cs_seq = cs->sequence;
1318
1319         hl_debugfs_add_cs(cs);
1320
1321         rc = cs_staged_submission(hdev, cs, user_sequence, flags,
1322                                                 encaps_signals_handle);
1323         if (rc)
1324                 goto free_cs_object;
1325
1326         /* If this is a staged submission we must return the staged sequence
1327          * rather than the internal CS sequence
1328          */
1329         if (cs->staged_cs)
1330                 *cs_seq = cs->staged_sequence;
1331
1332         /* Validate ALL the CS chunks before submitting the CS */
1333         for (i = 0 ; i < num_chunks ; i++) {
1334                 struct hl_cs_chunk *chunk = &cs_chunk_array[i];
1335                 enum hl_queue_type queue_type;
1336                 bool is_kernel_allocated_cb;
1337
1338                 rc = validate_queue_index(hdev, chunk, &queue_type,
1339                                                 &is_kernel_allocated_cb);
1340                 if (rc) {
1341                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1342                         atomic64_inc(&cntr->validation_drop_cnt);
1343                         goto free_cs_object;
1344                 }
1345
1346                 if (is_kernel_allocated_cb) {
1347                         cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
1348                         if (!cb) {
1349                                 atomic64_inc(
1350                                         &ctx->cs_counters.validation_drop_cnt);
1351                                 atomic64_inc(&cntr->validation_drop_cnt);
1352                                 rc = -EINVAL;
1353                                 goto free_cs_object;
1354                         }
1355                 } else {
1356                         cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
1357                 }
1358
1359                 if (queue_type == QUEUE_TYPE_EXT ||
1360                                                 queue_type == QUEUE_TYPE_HW) {
1361                         int_queues_only = false;
1362
1363                         /*
1364                          * store which streams are being used for external/HW
1365                          * queues of this CS
1366                          */
1367                         if (hdev->supports_wait_for_multi_cs)
1368                                 stream_master_qid_map |=
1369                                         get_stream_master_qid_mask(hdev,
1370                                                         chunk->queue_index);
1371                 }
1372
1373                 job = hl_cs_allocate_job(hdev, queue_type,
1374                                                 is_kernel_allocated_cb);
1375                 if (!job) {
1376                         atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1377                         atomic64_inc(&cntr->out_of_mem_drop_cnt);
1378                         dev_err(hdev->dev, "Failed to allocate a new job\n");
1379                         rc = -ENOMEM;
1380                         if (is_kernel_allocated_cb)
1381                                 goto release_cb;
1382
1383                         goto free_cs_object;
1384                 }
1385
1386                 job->id = i + 1;
1387                 job->cs = cs;
1388                 job->user_cb = cb;
1389                 job->user_cb_size = chunk->cb_size;
1390                 job->hw_queue_id = chunk->queue_index;
1391
1392                 cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1393
1394                 list_add_tail(&job->cs_node, &cs->job_list);
1395
1396                 /*
1397                  * Increment CS reference. When CS reference is 0, CS is
1398                  * done, can be signaled to the user, and all its resources freed.
1399                  * Only increment for a JOB on external or H/W queues, because
1400                  * only for those JOBs do we get a completion
1401                  */
1402                 if (cs_needs_completion(cs) &&
1403                         (job->queue_type == QUEUE_TYPE_EXT ||
1404                                 job->queue_type == QUEUE_TYPE_HW))
1405                         cs_get(cs);
1406
1407                 hl_debugfs_add_job(hdev, job);
1408
1409                 rc = cs_parser(hpriv, job);
1410                 if (rc) {
1411                         atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
1412                         atomic64_inc(&cntr->parsing_drop_cnt);
1413                         dev_err(hdev->dev,
1414                                 "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
1415                                 cs->ctx->asid, cs->sequence, job->id, rc);
1416                         goto free_cs_object;
1417                 }
1418         }
1419
1420         /* We allow a CS with any queue type combination as long as it does
1421          * not get a completion
1422          */
1423         if (int_queues_only && cs_needs_completion(cs)) {
1424                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1425                 atomic64_inc(&cntr->validation_drop_cnt);
1426                 dev_err(hdev->dev,
1427                         "Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
1428                         cs->ctx->asid, cs->sequence);
1429                 rc = -EINVAL;
1430                 goto free_cs_object;
1431         }
1432
1433         /*
1434          * store the (external/HW queues) streams used by the CS in the
1435          * fence object for multi-CS completion
1436          */
1437         if (hdev->supports_wait_for_multi_cs)
1438                 cs->fence->stream_master_qid_map = stream_master_qid_map;
1439
1440         rc = hl_hw_queue_schedule_cs(cs);
1441         if (rc) {
1442                 if (rc != -EAGAIN)
1443                         dev_err(hdev->dev,
1444                                 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
1445                                 cs->ctx->asid, cs->sequence, rc);
1446                 goto free_cs_object;
1447         }
1448
1449         *signal_initial_sob_count = cs->initial_sob_count;
1450
1451         rc = HL_CS_STATUS_SUCCESS;
1452         goto put_cs;
1453
1454 release_cb:
1455         atomic_dec(&cb->cs_cnt);
1456         hl_cb_put(cb);
1457 free_cs_object:
1458         cs_rollback(hdev, cs);
1459         *cs_seq = ULLONG_MAX;
1460         /* The path below is both for good and erroneous exits */
1461 put_cs:
1462         /* We finished with the CS in this function, so put the ref */
1463         cs_put(cs);
1464 free_cs_chunk_array:
1465         kfree(cs_chunk_array);
1466 out:
1467         return rc;
1468 }
1469
1470 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
1471                                 u64 *cs_seq)
1472 {
1473         struct hl_device *hdev = hpriv->hdev;
1474         struct hl_ctx *ctx = hpriv->ctx;
1475         bool need_soft_reset = false;
1476         int rc = 0, do_ctx_switch;
1477         void __user *chunks;
1478         u32 num_chunks, tmp;
1479         u16 sob_count;
1480         int ret;
1481
1482         do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
1483
1484         if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
1485                 mutex_lock(&hpriv->restore_phase_mutex);
1486
1487                 if (do_ctx_switch) {
1488                         rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
1489                         if (rc) {
1490                                 dev_err_ratelimited(hdev->dev,
1491                                         "Failed to switch to context %d, rejecting CS! %d\n",
1492                                         ctx->asid, rc);
1493                                 /*
1494                                  * If we timed out, or if the device is not
1495                                  * IDLE while we want to do a context switch
1496                                  * (-EBUSY), we need to soft-reset because the
1497                                  * QMAN is probably stuck. However, we can't
1498                                  * call reset here directly because of a
1499                                  * deadlock, so we need to do it at the very
1500                                  * end of this function.
1501                                  */
1502                                 if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
1503                                         need_soft_reset = true;
1504                                 mutex_unlock(&hpriv->restore_phase_mutex);
1505                                 goto out;
1506                         }
1507                 }
1508
1509                 hdev->asic_funcs->restore_phase_topology(hdev);
1510
1511                 chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
1512                 num_chunks = args->in.num_chunks_restore;
1513
1514                 if (!num_chunks) {
1515                         dev_dbg(hdev->dev,
1516                                 "Need to run restore phase but restore CS is empty\n");
1517                         rc = 0;
1518                 } else {
1519                         rc = cs_ioctl_default(hpriv, chunks, num_chunks,
1520                                         cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
1521                 }
1522
1523                 mutex_unlock(&hpriv->restore_phase_mutex);
1524
1525                 if (rc) {
1526                         dev_err(hdev->dev,
1527                                 "Failed to submit restore CS for context %d (%d)\n",
1528                                 ctx->asid, rc);
1529                         goto out;
1530                 }
1531
1532                 /* Need to wait for restore completion before execution phase */
1533                 if (num_chunks) {
1534                         enum hl_cs_wait_status status;
1535 wait_again:
1536                         ret = _hl_cs_wait_ioctl(hdev, ctx,
1537                                         jiffies_to_usecs(hdev->timeout_jiffies),
1538                                         *cs_seq, &status, NULL);
1539                         if (ret) {
1540                                 if (ret == -ERESTARTSYS) {
1541                                         usleep_range(100, 200);
1542                                         goto wait_again;
1543                                 }
1544
1545                                 dev_err(hdev->dev,
1546                                         "Restore CS for context %d failed to complete %d\n",
1547                                         ctx->asid, ret);
1548                                 rc = -ENOEXEC;
1549                                 goto out;
1550                         }
1551                 }
1552
1553                 ctx->thread_ctx_switch_wait_token = 1;
1554
1555         } else if (!ctx->thread_ctx_switch_wait_token) {
1556                 rc = hl_poll_timeout_memory(hdev,
1557                         &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
1558                         100, jiffies_to_usecs(hdev->timeout_jiffies), false);
1559
1560                 if (rc == -ETIMEDOUT) {
1561                         dev_err(hdev->dev,
1562                                 "context switch phase timeout (%d)\n", tmp);
1563                         goto out;
1564                 }
1565         }
1566
1567 out:
1568         if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
1569                 hl_device_reset(hdev, 0);
1570
1571         return rc;
1572 }
1573
1574 /*
1575  * hl_cs_signal_sob_wraparound_handler: handle the SOB value wraparound case.
1576  * If the SOB value reaches the max value, move to the other SOB reserved
1577  * for the queue.
1578  * @hdev: pointer to device structure
1579  * @q_idx: stream queue index
1580  * @hw_sob: the H/W SOB used in this signal CS.
1581  * @count: signals count
1582  * @encaps_sig: tells whether it's reservation for encaps signals or not.
1583  *
1584  * Note that this function must be called while hw_queues_lock is taken.
1585  */
1586 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
1587                         struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
1588
1589 {
1590         struct hl_sync_stream_properties *prop;
1591         struct hl_hw_sob *sob = *hw_sob, *other_sob;
1592         u8 other_sob_offset;
1593
1594         prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1595
1596         hw_sob_get(sob);
1597
1598         /* check for wraparound */
1599         if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
1600                 /*
1601                  * Decrement as we reached the max value.
1602                  * The release function won't be called here as we've
1603                  * just incremented the refcount right before calling this
1604                  * function.
1605                  */
1606                 hw_sob_put_err(sob);
1607
1608                 /*
1609                  * Check the other SOB value: if it is still in use then fail,
1610                  * otherwise make the switch.
1611                  */
1612                 other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
1613                 other_sob = &prop->hw_sob[other_sob_offset];
1614
1615                 if (kref_read(&other_sob->kref) != 1) {
1616                         dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
1617                                                                 q_idx);
1618                         return -EINVAL;
1619                 }
1620
1621                 /*
1622                  * next_sob_val always points to the next available signal
1623                  * in the sob, so for encapsulated signals it will be the next one
1624                  * after reserving the required amount.
1625                  */
1626                 if (encaps_sig)
1627                         prop->next_sob_val = count + 1;
1628                 else
1629                         prop->next_sob_val = count;
1630
1631                 /* only two SOBs are currently in use */
1632                 prop->curr_sob_offset = other_sob_offset;
1633                 *hw_sob = other_sob;
1634
1635                 /*
1636                  * Check if other_sob needs a reset, and if so do it before
1637                  * using it for the reservation or for the next signal CS.
1638                  * We do it here, for both the encaps and the regular signal CS
1639                  * cases, in order to avoid a possible race of two kref_put
1640                  * calls on the SOB occurring at the same time, which could
1641                  * happen if we moved the SOB reset (kref_put) to cs_do_release.
1642                  * In addition, if we have a combination of signal CSs and
1643                  * encaps reservations, and at the point the SOB needs a reset
1644                  * there are no more reservations and only signal CSs keep
1645                  * coming, then the signal CS path must put the refcount and
1646                  * reset the SOB.
1647                  */
1648                 if (other_sob->need_reset)
1649                         hw_sob_put(other_sob);
1650
1651                 if (encaps_sig) {
1652                         /* set reset indication for the sob */
1653                         sob->need_reset = true;
1654                         hw_sob_get(other_sob);
1655                 }
1656
1657                 dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
1658                                 prop->curr_sob_offset, q_idx);
1659         } else {
1660                 prop->next_sob_val += count;
1661         }
1662
1663         return 0;
1664 }
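
/*
 * Hedged usage sketch (illustration only, not part of the driver): a
 * hypothetical caller that reserves 'count' signals on stream queue 'q_idx'.
 * As the kernel-doc above notes, hw_queues_lock must be held around the call,
 * and the SOB pointer may be switched to the other reserved SOB on wraparound.
 * The function name is made up for this example.
 */
static int example_reserve_stream_signals(struct hl_device *hdev, u32 q_idx,
						u32 count)
{
	struct hl_sync_stream_properties *prop =
			&hdev->kernel_queues[q_idx].sync_stream_prop;
	struct hl_hw_sob *hw_sob;
	int rc;

	hdev->asic_funcs->hw_queues_lock(hdev);

	hw_sob = &prop->hw_sob[prop->curr_sob_offset];

	/* may replace hw_sob with the other reserved SOB on wraparound */
	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
							true);

	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}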
1665
1666 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
1667                 struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
1668                 bool encaps_signals)
1669 {
1670         u64 *signal_seq_arr = NULL;
1671         u32 size_to_copy, signal_seq_arr_len;
1672         int rc = 0;
1673
1674         if (encaps_signals) {
1675                 *signal_seq = chunk->encaps_signal_seq;
1676                 return 0;
1677         }
1678
1679         signal_seq_arr_len = chunk->num_signal_seq_arr;
1680
1681         /* currently only one signal seq is supported */
1682         if (signal_seq_arr_len != 1) {
1683                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1684                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1685                 dev_err(hdev->dev,
1686                         "Wait for signal CS supports only one signal CS seq\n");
1687                 return -EINVAL;
1688         }
1689
1690         signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1691                                         sizeof(*signal_seq_arr),
1692                                         GFP_ATOMIC);
1693         if (!signal_seq_arr)
1694                 signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1695                                         sizeof(*signal_seq_arr),
1696                                         GFP_KERNEL);
1697         if (!signal_seq_arr) {
1698                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1699                 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1700                 return -ENOMEM;
1701         }
1702
1703         size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
1704         if (copy_from_user(signal_seq_arr,
1705                                 u64_to_user_ptr(chunk->signal_seq_arr),
1706                                 size_to_copy)) {
1707                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1708                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1709                 dev_err(hdev->dev,
1710                         "Failed to copy signal seq array from user\n");
1711                 rc = -EFAULT;
1712                 goto out;
1713         }
1714
1715         /* currently it is guaranteed to have only one signal seq */
1716         *signal_seq = signal_seq_arr[0];
1717
1718 out:
1719         kfree(signal_seq_arr);
1720
1721         return rc;
1722 }
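
/*
 * Hedged sketch of the allocation pattern used above (illustration only):
 * try a non-sleeping GFP_ATOMIC allocation first and, only if that fails,
 * fall back to a sleeping GFP_KERNEL allocation. The helper name is made up
 * for this example.
 */
static void *example_alloc_seq_array(size_t n, size_t elem_size)
{
	void *arr = kmalloc_array(n, elem_size, GFP_ATOMIC);

	if (!arr)
		arr = kmalloc_array(n, elem_size, GFP_KERNEL);

	return arr;
}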
1723
1724 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
1725                 struct hl_ctx *ctx, struct hl_cs *cs,
1726                 enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
1727 {
1728         struct hl_cs_counters_atomic *cntr;
1729         struct hl_cs_job *job;
1730         struct hl_cb *cb;
1731         u32 cb_size;
1732
1733         cntr = &hdev->aggregated_cs_counters;
1734
1735         job = hl_cs_allocate_job(hdev, q_type, true);
1736         if (!job) {
1737                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1738                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1739                 dev_err(hdev->dev, "Failed to allocate a new job\n");
1740                 return -ENOMEM;
1741         }
1742
1743         if (cs->type == CS_TYPE_WAIT)
1744                 cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
1745         else
1746                 cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
1747
1748         cb = hl_cb_kernel_create(hdev, cb_size,
1749                                 q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
1750         if (!cb) {
1751                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1752                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1753                 kfree(job);
1754                 return -EFAULT;
1755         }
1756
1757         job->id = 0;
1758         job->cs = cs;
1759         job->user_cb = cb;
1760         atomic_inc(&job->user_cb->cs_cnt);
1761         job->user_cb_size = cb_size;
1762         job->hw_queue_id = q_idx;
1763
1764         if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
1765                         && cs->encaps_signals)
1766                 job->encaps_sig_wait_offset = encaps_signal_offset;
1767         /*
1768          * No need for parsing, the user CB is the patched CB.
1769          * We call hl_cb_destroy() for two reasons: we don't need the CB in
1770          * the CB idr anymore, and to decrement its refcount as it was
1771          * incremented inside hl_cb_kernel_create().
1772          */
1773         job->patched_cb = job->user_cb;
1774         job->job_cb_size = job->user_cb_size;
1775         hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
1776
1777         /* increment refcount as for external queues we get completion */
1778         cs_get(cs);
1779
1780         cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1781
1782         list_add_tail(&job->cs_node, &cs->job_list);
1783
1784         hl_debugfs_add_job(hdev, job);
1785
1786         return 0;
1787 }
1788
1789 static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
1790                                 u32 q_idx, u32 count,
1791                                 u32 *handle_id, u32 *sob_addr,
1792                                 u32 *signals_count)
1793 {
1794         struct hw_queue_properties *hw_queue_prop;
1795         struct hl_sync_stream_properties *prop;
1796         struct hl_device *hdev = hpriv->hdev;
1797         struct hl_cs_encaps_sig_handle *handle;
1798         struct hl_encaps_signals_mgr *mgr;
1799         struct hl_hw_sob *hw_sob;
1800         int hdl_id;
1801         int rc = 0;
1802
1803         if (count >= HL_MAX_SOB_VAL) {
1804                 dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
1805                                                 count);
1806                 rc = -EINVAL;
1807                 goto out;
1808         }
1809
1810         if (q_idx >= hdev->asic_prop.max_queues) {
1811                 dev_err(hdev->dev, "Queue index %d is invalid\n",
1812                         q_idx);
1813                 rc = -EINVAL;
1814                 goto out;
1815         }
1816
1817         hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
1818
1819         if (!hw_queue_prop->supports_sync_stream) {
1820                 dev_err(hdev->dev,
1821                         "Queue index %d does not support sync stream operations\n",
1822                                                                         q_idx);
1823                 rc = -EINVAL;
1824                 goto out;
1825         }
1826
1827         prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1828
1829         handle = kzalloc(sizeof(*handle), GFP_KERNEL);
1830         if (!handle) {
1831                 rc = -ENOMEM;
1832                 goto out;
1833         }
1834
1835         handle->count = count;
1836
1837         hl_ctx_get(hdev, hpriv->ctx);
1838         handle->ctx = hpriv->ctx;
1839         mgr = &hpriv->ctx->sig_mgr;
1840
1841         spin_lock(&mgr->lock);
1842         hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
1843         spin_unlock(&mgr->lock);
1844
1845         if (hdl_id < 0) {
1846                 dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
1847                 rc = -EINVAL;
1848                 goto put_ctx;
1849         }
1850
1851         handle->id = hdl_id;
1852         handle->q_idx = q_idx;
1853         handle->hdev = hdev;
1854         kref_init(&handle->refcount);
1855
1856         hdev->asic_funcs->hw_queues_lock(hdev);
1857
1858         hw_sob = &prop->hw_sob[prop->curr_sob_offset];
1859
1860         /*
1861          * Increment the SOB value by 'count', per the user request,
1862          * to reserve those signals.
1863          * Check that the amount of signals to reserve does not exceed the max
1864          * SOB value; if it does, switch to the other SOB.
1865          */
1866         rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
1867                                                                 true);
1868         if (rc) {
1869                 dev_err(hdev->dev, "Failed to switch SOB\n");
1870                 hdev->asic_funcs->hw_queues_unlock(hdev);
1871                 rc = -EINVAL;
1872                 goto remove_idr;
1873         }
1874         /* store the hw_sob in the handle only after calling the SOB wraparound
1875          * handler, since the SOB could have changed.
1876          */
1877         handle->hw_sob = hw_sob;
1878
1879         /* store the current sob value for unreserve validity check, and
1880          * signal offset support
1881          */
1882         handle->pre_sob_val = prop->next_sob_val - handle->count;
1883
1884         *signals_count = prop->next_sob_val;
1885         hdev->asic_funcs->hw_queues_unlock(hdev);
1886
1887         *sob_addr = handle->hw_sob->sob_addr;
1888         *handle_id = hdl_id;
1889
1890         dev_dbg(hdev->dev,
1891                 "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
1892                         hw_sob->sob_id, handle->hw_sob->sob_addr,
1893                         prop->next_sob_val - 1, q_idx, hdl_id);
1894         goto out;
1895
1896 remove_idr:
1897         spin_lock(&mgr->lock);
1898         idr_remove(&mgr->handles, hdl_id);
1899         spin_unlock(&mgr->lock);
1900
1901 put_ctx:
1902         hl_ctx_put(handle->ctx);
1903         kfree(handle);
1904
1905 out:
1906         return rc;
1907 }
1908
1909 static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
1910 {
1911         struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
1912         struct hl_sync_stream_properties *prop;
1913         struct hl_device *hdev = hpriv->hdev;
1914         struct hl_encaps_signals_mgr *mgr;
1915         struct hl_hw_sob *hw_sob;
1916         u32 q_idx, sob_addr;
1917         int rc = 0;
1918
1919         mgr = &hpriv->ctx->sig_mgr;
1920
1921         spin_lock(&mgr->lock);
1922         encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
1923         if (encaps_sig_hdl) {
1924                 dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
1925                                 handle_id, encaps_sig_hdl->hw_sob->sob_addr,
1926                                         encaps_sig_hdl->count);
1927
1928                 hdev->asic_funcs->hw_queues_lock(hdev);
1929
1930                 q_idx = encaps_sig_hdl->q_idx;
1931                 prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1932                 hw_sob = &prop->hw_sob[prop->curr_sob_offset];
1933                 sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
1934
1935                 /* Check if sob_val got out of sync due to other
1936                  * signal submission requests which were handled
1937                  * between the reserve-unreserve calls, or due to a SOB
1938                  * switch upon reaching the SOB max value.
1939                  */
1940                 if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
1941                                 != prop->next_sob_val ||
1942                                 sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
1943                         dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
1944                                 encaps_sig_hdl->pre_sob_val,
1945                                 (prop->next_sob_val - encaps_sig_hdl->count));
1946
1947                         hdev->asic_funcs->hw_queues_unlock(hdev);
1948                         rc = -EINVAL;
1949                         goto out;
1950                 }
1951
1952                 /*
1953                  * Decrement the SOB value by 'count', per the user request,
1954                  * to unreserve those signals.
1955                  */
1956                 prop->next_sob_val -= encaps_sig_hdl->count;
1957
1958                 hdev->asic_funcs->hw_queues_unlock(hdev);
1959
1960                 hw_sob_put(hw_sob);
1961
1962                 /* Release the id and free allocated memory of the handle */
1963                 idr_remove(&mgr->handles, handle_id);
1964                 hl_ctx_put(encaps_sig_hdl->ctx);
1965                 kfree(encaps_sig_hdl);
1966         } else {
1967                 rc = -EINVAL;
1968                 dev_err(hdev->dev, "failed to unreserve signals, cannot find handle\n");
1969         }
1970 out:
1971         spin_unlock(&mgr->lock);
1972
1973         return rc;
1974 }
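
/*
 * Hedged userspace-side sketch (illustration only, not kernel code): how an
 * application might reserve and later unreserve encapsulated signals through
 * the CS ioctl, using only the in/out fields referenced in this file. The
 * HL_CS_FLAGS_RESERVE_SIGNALS_ONLY / HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY flag
 * names and the HL_IOCTL_CS request are assumed from the habanalabs uapi
 * header; error handling is trimmed for brevity.
 *
 *	union hl_cs_args args = {0};
 *
 *	args.in.cs_flags = HL_CS_FLAGS_RESERVE_SIGNALS_ONLY;
 *	args.in.encaps_signals_q_idx = q_idx;
 *	args.in.encaps_signals_count = count;
 *	ioctl(fd, HL_IOCTL_CS, &args);
 *	handle_id = args.out.handle_id;
 *	sob_addr_offset = args.out.sob_base_addr_offset;
 *
 *	memset(&args, 0, sizeof(args));
 *	args.in.cs_flags = HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY;
 *	args.in.encaps_sig_handle_id = handle_id;
 *	ioctl(fd, HL_IOCTL_CS, &args);
 */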
1975
1976 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
1977                                 void __user *chunks, u32 num_chunks,
1978                                 u64 *cs_seq, u32 flags, u32 timeout,
1979                                 u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
1980 {
1981         struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
1982         bool handle_found = false, is_wait_cs = false,
1983                         wait_cs_submitted = false,
1984                         cs_encaps_signals = false;
1985         struct hl_cs_chunk *cs_chunk_array, *chunk;
1986         bool staged_cs_with_encaps_signals = false;
1987         struct hw_queue_properties *hw_queue_prop;
1988         struct hl_device *hdev = hpriv->hdev;
1989         struct hl_cs_compl *sig_waitcs_cmpl;
1990         u32 q_idx, collective_engine_id = 0;
1991         struct hl_cs_counters_atomic *cntr;
1992         struct hl_fence *sig_fence = NULL;
1993         struct hl_ctx *ctx = hpriv->ctx;
1994         enum hl_queue_type q_type;
1995         struct hl_cs *cs;
1996         u64 signal_seq;
1997         int rc;
1998
1999         cntr = &hdev->aggregated_cs_counters;
2000         *cs_seq = ULLONG_MAX;
2001
2002         rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
2003                         ctx);
2004         if (rc)
2005                 goto out;
2006
2007         /* currently it is guaranteed to have only one chunk */
2008         chunk = &cs_chunk_array[0];
2009
2010         if (chunk->queue_index >= hdev->asic_prop.max_queues) {
2011                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2012                 atomic64_inc(&cntr->validation_drop_cnt);
2013                 dev_err(hdev->dev, "Queue index %d is invalid\n",
2014                         chunk->queue_index);
2015                 rc = -EINVAL;
2016                 goto free_cs_chunk_array;
2017         }
2018
2019         q_idx = chunk->queue_index;
2020         hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
2021         q_type = hw_queue_prop->type;
2022
2023         if (!hw_queue_prop->supports_sync_stream) {
2024                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2025                 atomic64_inc(&cntr->validation_drop_cnt);
2026                 dev_err(hdev->dev,
2027                         "Queue index %d does not support sync stream operations\n",
2028                         q_idx);
2029                 rc = -EINVAL;
2030                 goto free_cs_chunk_array;
2031         }
2032
2033         if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
2034                 if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
2035                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2036                         atomic64_inc(&cntr->validation_drop_cnt);
2037                         dev_err(hdev->dev,
2038                                 "Queue index %d is invalid\n", q_idx);
2039                         rc = -EINVAL;
2040                         goto free_cs_chunk_array;
2041                 }
2042
2043                 if (!hdev->nic_ports_mask) {
2044                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2045                         atomic64_inc(&cntr->validation_drop_cnt);
2046                         dev_err(hdev->dev,
2047                                 "Collective operations not supported when NIC ports are disabled");
2048                         rc = -EINVAL;
2049                         goto free_cs_chunk_array;
2050                 }
2051
2052                 collective_engine_id = chunk->collective_engine_id;
2053         }
2054
2055         is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
2056                         cs_type == CS_TYPE_COLLECTIVE_WAIT);
2057
2058         cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
2059
2060         if (is_wait_cs) {
2061                 rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
2062                                 ctx, cs_encaps_signals);
2063                 if (rc)
2064                         goto free_cs_chunk_array;
2065
2066                 if (cs_encaps_signals) {
2067                         /* check if cs sequence has encapsulated
2068                          * signals handle
2069                          */
2070                         struct idr *idp;
2071                         u32 id;
2072
2073                         spin_lock(&ctx->sig_mgr.lock);
2074                         idp = &ctx->sig_mgr.handles;
2075                         idr_for_each_entry(idp, encaps_sig_hdl, id) {
2076                                 if (encaps_sig_hdl->cs_seq == signal_seq) {
2077                                         /* get refcount to protect removing this handle from idr,
2078                                          * needed when multiple wait cs are used with offset
2079                                          * to wait on reserved encaps signals.
2080                                          * Since kref_put of this handle is executed outside the
2081                                          * current lock, it is possible that the handle refcount
2082                                          * is 0 but it has yet to be removed from the list. In
2083                                          * this case we need to consider the handle as not valid.
2084                                          */
2085                                         if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
2086                                                 handle_found = true;
2087                                         break;
2088                                 }
2089                         }
2090                         spin_unlock(&ctx->sig_mgr.lock);
2091
2092                         if (!handle_found) {
2093                                 /* treat as signal CS already finished */
2094                                 dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
2095                                                 signal_seq);
2096                                 rc = 0;
2097                                 goto free_cs_chunk_array;
2098                         }
2099
2100                         /* validate also the signal offset value */
2101                         if (chunk->encaps_signal_offset >
2102                                         encaps_sig_hdl->count) {
2103                                 dev_err(hdev->dev, "offset(%u) value exceeds max reserved signals count(%u)!\n",
2104                                                 chunk->encaps_signal_offset,
2105                                                 encaps_sig_hdl->count);
2106                                 rc = -EINVAL;
2107                                 goto free_cs_chunk_array;
2108                         }
2109                 }
2110
2111                 sig_fence = hl_ctx_get_fence(ctx, signal_seq);
2112                 if (IS_ERR(sig_fence)) {
2113                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2114                         atomic64_inc(&cntr->validation_drop_cnt);
2115                         dev_err(hdev->dev,
2116                                 "Failed to get signal CS with seq 0x%llx\n",
2117                                 signal_seq);
2118                         rc = PTR_ERR(sig_fence);
2119                         goto free_cs_chunk_array;
2120                 }
2121
2122                 if (!sig_fence) {
2123                         /* signal CS already finished */
2124                         rc = 0;
2125                         goto free_cs_chunk_array;
2126                 }
2127
2128                 sig_waitcs_cmpl =
2129                         container_of(sig_fence, struct hl_cs_compl, base_fence);
2130
2131                 staged_cs_with_encaps_signals = !!
2132                                 (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
2133                                 (flags & HL_CS_FLAGS_ENCAP_SIGNALS));
2134
2135                 if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
2136                                 !staged_cs_with_encaps_signals) {
2137                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2138                         atomic64_inc(&cntr->validation_drop_cnt);
2139                         dev_err(hdev->dev,
2140                                 "CS seq 0x%llx is not of a signal/encaps-signal CS\n",
2141                                 signal_seq);
2142                         hl_fence_put(sig_fence);
2143                         rc = -EINVAL;
2144                         goto free_cs_chunk_array;
2145                 }
2146
2147                 if (completion_done(&sig_fence->completion)) {
2148                         /* signal CS already finished */
2149                         hl_fence_put(sig_fence);
2150                         rc = 0;
2151                         goto free_cs_chunk_array;
2152                 }
2153         }
2154
2155         rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
2156         if (rc) {
2157                 if (is_wait_cs)
2158                         hl_fence_put(sig_fence);
2159
2160                 goto free_cs_chunk_array;
2161         }
2162
2163         /*
2164          * Save the signal CS fence for later initialization right before
2165          * hanging the wait CS on the queue.
2166          * For the encaps signals case, we save the CS sequence and handle pointer
2167          * for later initialization.
2168          */
2169         if (is_wait_cs) {
2170                 cs->signal_fence = sig_fence;
2171                 /* store the handle pointer, so we don't have to
2172                  * look for it again later in the flow
2173                  * when we need to set SOB info in hw_queue.
2174                  */
2175                 if (cs->encaps_signals)
2176                         cs->encaps_sig_hdl = encaps_sig_hdl;
2177         }
2178
2179         hl_debugfs_add_cs(cs);
2180
2181         *cs_seq = cs->sequence;
2182
2183         if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
2184                 rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
2185                                 q_idx, chunk->encaps_signal_offset);
2186         else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
2187                 rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
2188                                 cs, q_idx, collective_engine_id,
2189                                 chunk->encaps_signal_offset);
2190         else {
2191                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2192                 atomic64_inc(&cntr->validation_drop_cnt);
2193                 rc = -EINVAL;
2194         }
2195
2196         if (rc)
2197                 goto free_cs_object;
2198
2199         rc = hl_hw_queue_schedule_cs(cs);
2200         if (rc) {
2201                 /* In case the wait CS failed here, it means the signal CS
2202                  * already completed. We want to free all its related objects,
2203                  * but we don't want to fail the ioctl.
2204                  */
2205                 if (is_wait_cs)
2206                         rc = 0;
2207                 else if (rc != -EAGAIN)
2208                         dev_err(hdev->dev,
2209                                 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
2210                                 ctx->asid, cs->sequence, rc);
2211                 goto free_cs_object;
2212         }
2213
2214         *signal_sob_addr_offset = cs->sob_addr_offset;
2215         *signal_initial_sob_count = cs->initial_sob_count;
2216
2217         rc = HL_CS_STATUS_SUCCESS;
2218         if (is_wait_cs)
2219                 wait_cs_submitted = true;
2220         goto put_cs;
2221
2222 free_cs_object:
2223         cs_rollback(hdev, cs);
2224         *cs_seq = ULLONG_MAX;
2225         /* The path below is both for good and erroneous exits */
2226 put_cs:
2227         /* We finished with the CS in this function, so put the ref */
2228         cs_put(cs);
2229 free_cs_chunk_array:
2230         if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
2231                                                         is_wait_cs)
2232                 kref_put(&encaps_sig_hdl->refcount,
2233                                 hl_encaps_handle_do_release);
2234         kfree(cs_chunk_array);
2235 out:
2236         return rc;
2237 }
2238
2239 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
2240 {
2241         union hl_cs_args *args = data;
2242         enum hl_cs_type cs_type = 0;
2243         u64 cs_seq = ULLONG_MAX;
2244         void __user *chunks;
2245         u32 num_chunks, flags, timeout,
2246                 signals_count = 0, sob_addr = 0, handle_id = 0;
2247         u16 sob_initial_count = 0;
2248         int rc;
2249
2250         rc = hl_cs_sanity_checks(hpriv, args);
2251         if (rc)
2252                 goto out;
2253
2254         rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
2255         if (rc)
2256                 goto out;
2257
2258         cs_type = hl_cs_get_cs_type(args->in.cs_flags &
2259                                         ~HL_CS_FLAGS_FORCE_RESTORE);
2260         chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
2261         num_chunks = args->in.num_chunks_execute;
2262         flags = args->in.cs_flags;
2263
2264         /* In case this is a staged CS, user should supply the CS sequence */
2265         if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
2266                         !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
2267                 cs_seq = args->in.seq;
2268
2269         timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
2270                         ? msecs_to_jiffies(args->in.timeout * 1000)
2271                         : hpriv->hdev->timeout_jiffies;
2272
2273         switch (cs_type) {
2274         case CS_TYPE_SIGNAL:
2275         case CS_TYPE_WAIT:
2276         case CS_TYPE_COLLECTIVE_WAIT:
2277                 rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
2278                                         &cs_seq, args->in.cs_flags, timeout,
2279                                         &sob_addr, &sob_initial_count);
2280                 break;
2281         case CS_RESERVE_SIGNALS:
2282                 rc = cs_ioctl_reserve_signals(hpriv,
2283                                         args->in.encaps_signals_q_idx,
2284                                         args->in.encaps_signals_count,
2285                                         &handle_id, &sob_addr, &signals_count);
2286                 break;
2287         case CS_UNRESERVE_SIGNALS:
2288                 rc = cs_ioctl_unreserve_signals(hpriv,
2289                                         args->in.encaps_sig_handle_id);
2290                 break;
2291         default:
2292                 rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
2293                                                 args->in.cs_flags,
2294                                                 args->in.encaps_sig_handle_id,
2295                                                 timeout, &sob_initial_count);
2296                 break;
2297         }
2298 out:
2299         if (rc != -EAGAIN) {
2300                 memset(args, 0, sizeof(*args));
2301
2302                 switch (cs_type) {
2303                 case CS_RESERVE_SIGNALS:
2304                         args->out.handle_id = handle_id;
2305                         args->out.sob_base_addr_offset = sob_addr;
2306                         args->out.count = signals_count;
2307                         break;
2308                 case CS_TYPE_SIGNAL:
2309                         args->out.sob_base_addr_offset = sob_addr;
2310                         args->out.sob_count_before_submission = sob_initial_count;
2311                         args->out.seq = cs_seq;
2312                         break;
2313                 case CS_TYPE_DEFAULT:
2314                         args->out.sob_count_before_submission = sob_initial_count;
2315                         args->out.seq = cs_seq;
2316                         break;
2317                 default:
2318                         args->out.seq = cs_seq;
2319                         break;
2320                 }
2321
2322                 args->out.status = rc;
2323         }
2324
2325         return rc;
2326 }
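
/*
 * Hedged userspace-side sketch (illustration only, not kernel code): a
 * minimal signal CS submission through the ioctl handler above, using only
 * fields referenced in this file. HL_IOCTL_CS and HL_CS_FLAGS_SIGNAL are
 * assumed from the habanalabs uapi header; error handling is trimmed.
 *
 *	struct hl_cs_chunk chunk = { .queue_index = q_idx };
 *	union hl_cs_args args = {0};
 *
 *	args.in.chunks_execute = (__u64) (uintptr_t) &chunk;
 *	args.in.num_chunks_execute = 1;
 *	args.in.cs_flags = HL_CS_FLAGS_SIGNAL;
 *
 *	if (!ioctl(fd, HL_IOCTL_CS, &args) &&
 *			args.out.status == HL_CS_STATUS_SUCCESS) {
 *		seq = args.out.seq;
 *		sob_addr_offset = args.out.sob_base_addr_offset;
 *	}
 */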
2327
2328 static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
2329                                 enum hl_cs_wait_status *status, u64 timeout_us,
2330                                 s64 *timestamp)
2331 {
2332         struct hl_device *hdev = ctx->hdev;
2333         long completion_rc;
2334         int rc = 0;
2335
2336         if (IS_ERR(fence)) {
2337                 rc = PTR_ERR(fence);
2338                 if (rc == -EINVAL)
2339                         dev_notice_ratelimited(hdev->dev,
2340                                 "Can't wait on CS %llu because current CS is at seq %llu\n",
2341                                 seq, ctx->cs_sequence);
2342                 return rc;
2343         }
2344
2345         if (!fence) {
2346                 dev_dbg(hdev->dev,
2347                         "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
2348                                 seq, ctx->cs_sequence);
2349
2350                 *status = CS_WAIT_STATUS_GONE;
2351                 return 0;
2352         }
2353
2354         if (!timeout_us) {
2355                 completion_rc = completion_done(&fence->completion);
2356         } else {
2357                 unsigned long timeout;
2358
2359                 timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
2360                                 timeout_us : usecs_to_jiffies(timeout_us);
2361                 completion_rc =
2362                         wait_for_completion_interruptible_timeout(
2363                                 &fence->completion, timeout);
2364         }
2365
2366         if (completion_rc > 0) {
2367                 *status = CS_WAIT_STATUS_COMPLETED;
2368                 if (timestamp)
2369                         *timestamp = ktime_to_ns(fence->timestamp);
2370         } else {
2371                 *status = CS_WAIT_STATUS_BUSY;
2372         }
2373
2374         if (fence->error == -ETIMEDOUT)
2375                 rc = -ETIMEDOUT;
2376         else if (fence->error == -EIO)
2377                 rc = -EIO;
2378
2379         return rc;
2380 }
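
/*
 * Hedged note (illustration only): calling hl_wait_for_fence() with
 * timeout_us == 0 only polls completion_done() and never sleeps, which is
 * how hl_cs_poll_fences() below uses it; any non-zero timeout turns it into
 * an interruptible wait on the fence completion.
 */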
2381
2382 /*
2383  * hl_cs_poll_fences - iterate CS fences to check for CS completion
2384  *
2385  * @mcs_data: multi-CS internal data
2386  * @mcs_compl: multi-CS completion structure
2387  *
2388  * @return 0 on success, otherwise non 0 error code
2389  *
2390  * The function iterates over all CS sequences in the list and sets a bit in
2391  * completion_bitmap for each completed CS.
2392  * While iterating, the function adds the stream map of each fence in the fence
2393  * array to the completion QID stream map, to be used by CSs to complete the
2394  * multi-CS context.
2395  * This function shall be called after taking a context ref.
2396  */
2397 static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
2398 {
2399         struct hl_fence **fence_ptr = mcs_data->fence_arr;
2400         struct hl_device *hdev = mcs_data->ctx->hdev;
2401         int i, rc, arr_len = mcs_data->arr_len;
2402         u64 *seq_arr = mcs_data->seq_arr;
2403         ktime_t max_ktime, first_cs_time;
2404         enum hl_cs_wait_status status;
2405
2406         memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr));
2407
2408         /* get all fences under the same lock */
2409         rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
2410         if (rc)
2411                 return rc;
2412
2413         /*
2414          * Re-initialize the completion here to handle 2 possible cases:
2415          * 1. A CS will complete the multi-CS prior to clearing the completion, in
2416          *    which case the fence iteration is guaranteed to catch the CS completion.
2417          * 2. The completion will occur after the re-init of the completion, in
2418          *    which case we will wake up immediately in wait_for_completion.
2419          */
2420         reinit_completion(&mcs_compl->completion);
2421
2422         /*
2423          * Set to the maximum time to verify the timestamp is valid: if at the
2424          * end this value is maintained, no timestamp was updated.
2425          */
2426         max_ktime = ktime_set(KTIME_SEC_MAX, 0);
2427         first_cs_time = max_ktime;
2428
2429         for (i = 0; i < arr_len; i++, fence_ptr++) {
2430                 struct hl_fence *fence = *fence_ptr;
2431
2432                 /*
2433                  * In order to prevent a case where we wait until timeout even though a CS
2434                  * associated with the multi-CS actually completed, we do things in the below order:
2435                  * 1. for each fence set it's QID map in the multi-CS completion QID map. This way
2436                  *    any CS can, potentially, complete the multi CS for the specific QID (note
2437                  *    that once completion is initialized, calling complete* and then wait on the
2438                  *    completion will cause it to return at once)
2439                  * 2. only after allowing multi-CS completion for the specific QID we check whether
2440                  *    the specific CS already completed (and thus the wait for completion part will
2441                  *    be skipped). if the CS not completed it is guaranteed that completing CS will
2442                  *    wake up the completion.
2443                  */
2444                 if (fence)
2445                         mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
2446
2447                 /*
2448                  * function won't sleep as it is called with timeout 0 (i.e.
2449                  * poll the fence)
2450                  */
2451                 rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence,
2452                                                 &status, 0, NULL);
2453                 if (rc) {
2454                         dev_err(hdev->dev,
2455                                 "wait_for_fence error :%d for CS seq %llu\n",
2456                                                                 rc, seq_arr[i]);
2457                         break;
2458                 }
2459
2460                 switch (status) {
2461                 case CS_WAIT_STATUS_BUSY:
2462                         /* CS did not finish, QID to wait on already stored */
2463                         break;
2464                 case CS_WAIT_STATUS_COMPLETED:
2465                         /*
2466                          * Use mcs_handling_done to avoid the possibility that
2467                          * mcs_data is returned to the user indicating the CS
2468                          * completed before it finished all of its mcs handling,
2469                          * which would race with the next time the user waits for mcs.
2470                          * note: when reaching this case the fence is definitely not NULL,
2471                          *       but a NULL check was added to satisfy static analysis
2472                          */
2473                         if (fence && !fence->mcs_handling_done) {
2474                                 /*
2475                                  * In case the multi-CS is completed but the MCS handling
2476                                  * is not done, we "complete" the multi-CS to prevent it
2477                                  * from waiting until timeout; the "multi-CS handling done"
2478                                  * check will get another chance at the next iteration
2479                                  */
2480                                 complete_all(&mcs_compl->completion);
2481                                 break;
2482                         }
2483
2484                         mcs_data->completion_bitmap |= BIT(i);
2485                         /*
2486                          * For all completed CSs we take the earliest timestamp.
2487                          * For this we have to validate that the timestamp is
2488                          * earliest of all timestamps so far.
2489                          */
2490                         if (mcs_data->update_ts &&
2491                                         (ktime_compare(fence->timestamp, first_cs_time) < 0))
2492                                 first_cs_time = fence->timestamp;
2493                         break;
2494                 case CS_WAIT_STATUS_GONE:
2495                         mcs_data->update_ts = false;
2496                         mcs_data->gone_cs = true;
2497                         /*
2498                          * It is possible to get old sequence numbers from the user
2499                          * which relate to already-completed CSs whose fences are
2500                          * already gone. In this case, the CS is set as completed but
2501                          * there is no need to consider its QID for mcs completion.
2502                          */
2503                         mcs_data->completion_bitmap |= BIT(i);
2504                         break;
2505                 default:
2506                         dev_err(hdev->dev, "Invalid fence status\n");
2507                         return -EINVAL;
2508                 }
2509
2510         }
2511
2512         hl_fences_put(mcs_data->fence_arr, arr_len);
2513
2514         if (mcs_data->update_ts &&
2515                         (ktime_compare(first_cs_time, max_ktime) != 0))
2516                 mcs_data->timestamp = ktime_to_ns(first_cs_time);
2517
2518         return rc;
2519 }
2520
2521 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
2522                                 u64 timeout_us, u64 seq,
2523                                 enum hl_cs_wait_status *status, s64 *timestamp)
2524 {
2525         struct hl_fence *fence;
2526         int rc = 0;
2527
2528         if (timestamp)
2529                 *timestamp = 0;
2530
2531         hl_ctx_get(hdev, ctx);
2532
2533         fence = hl_ctx_get_fence(ctx, seq);
2534
2535         rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
2536         hl_fence_put(fence);
2537         hl_ctx_put(ctx);
2538
2539         return rc;
2540 }
2541
2542 static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
2543 {
2544         if (usecs <= U32_MAX)
2545                 return usecs_to_jiffies(usecs);
2546
2547         /*
2548          * If the value in nanoseconds would overflow 64 bits, use the largest
2549          * 64-bit value.
2550          */
2551         if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
2552                 return nsecs_to_jiffies(U64_MAX);
2553
2554         return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
2555 }
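
/*
 * Hedged illustration (not part of the driver): the conversion above
 * saturates instead of overflowing, e.g. any usecs value at or above
 * U64_MAX / NSEC_PER_USEC is clamped to nsecs_to_jiffies(U64_MAX), which
 * behaves as an effectively unbounded wait for the wait ioctls.
 */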
2556
2557 /*
2558  * hl_wait_multi_cs_completion_init - init completion structure
2559  *
2560  * @hdev: pointer to habanalabs device structure
2563  *
2564  * @return valid completion struct pointer on success, otherwise error pointer
2565  *
2566  * Up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver.
2567  * The function gets the first available completion (by marking it "used")
2568  * and initializes its values.
2569  */
2570 static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
2571 {
2572         struct multi_cs_completion *mcs_compl;
2573         int i;
2574
2575         /* find free multi_cs completion structure */
2576         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2577                 mcs_compl = &hdev->multi_cs_completion[i];
2578                 spin_lock(&mcs_compl->lock);
2579                 if (!mcs_compl->used) {
2580                         mcs_compl->used = 1;
2581                         mcs_compl->timestamp = 0;
2582                         /*
2583                          * Init the QID map to 0 to avoid completion by CSs. The actual QID
2584                          * map of the multi-CS CSs will be set incrementally at a later stage.
2585                          */
2586                         mcs_compl->stream_master_qid_map = 0;
2587                         spin_unlock(&mcs_compl->lock);
2588                         break;
2589                 }
2590                 spin_unlock(&mcs_compl->lock);
2591         }
2592
2593         if (i == MULTI_CS_MAX_USER_CTX) {
2594                 dev_err(hdev->dev, "no available multi-CS completion structure\n");
2595                 return ERR_PTR(-ENOMEM);
2596         }
2597         return mcs_compl;
2598 }
2599
2600 /*
2601  * hl_wait_multi_cs_completion_fini - return completion structure and set as
2602  *                                    unused
2603  *
2604  * @mcs_compl: pointer to the completion structure
2605  */
2606 static void hl_wait_multi_cs_completion_fini(
2607                                         struct multi_cs_completion *mcs_compl)
2608 {
2609         /*
2610          * free completion structure, do it under lock to be in-sync with the
2611          * thread that signals completion
2612          */
2613         spin_lock(&mcs_compl->lock);
2614         mcs_compl->used = 0;
2615         spin_unlock(&mcs_compl->lock);
2616 }
2617
2618 /*
2619  * hl_wait_multi_cs_completion - wait for first CS to complete
2620  *
2621  * @mcs_data: multi-CS internal data
2622  * @mcs_data: multi-CS internal data
 * @mcs_compl: multi-CS completion structure
2623  * @return 0 on success, otherwise non 0 error code
2624  */
2625 static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
2626                                                 struct multi_cs_completion *mcs_compl)
2627 {
2628         long completion_rc;
2629
2630         completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
2631                                                                         mcs_data->timeout_jiffies);
2632
2633         /* update timestamp */
2634         if (completion_rc > 0)
2635                 mcs_data->timestamp = mcs_compl->timestamp;
2636
2637         mcs_data->wait_status = completion_rc;
2638
2639         return 0;
2640 }
2641
2642 /*
2643  * hl_multi_cs_completion_init - init array of multi-CS completion structures
2644  *
2645  * @hdev: pointer to habanalabs device structure
2646  */
2647 void hl_multi_cs_completion_init(struct hl_device *hdev)
2648 {
2649         struct multi_cs_completion *mcs_cmpl;
2650         int i;
2651
2652         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2653                 mcs_cmpl = &hdev->multi_cs_completion[i];
2654                 mcs_cmpl->used = 0;
2655                 spin_lock_init(&mcs_cmpl->lock);
2656                 init_completion(&mcs_cmpl->completion);
2657         }
2658 }
2659
2660 /*
2661  * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
2662  *
2663  * @hpriv: pointer to the private data of the fd
2664  * @data: pointer to multi-CS wait ioctl in/out args
2665  *
2666  */
2667 static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
2668 {
2669         struct multi_cs_completion *mcs_compl;
2670         struct hl_device *hdev = hpriv->hdev;
2671         struct multi_cs_data mcs_data = {0};
2672         union hl_wait_cs_args *args = data;
2673         struct hl_ctx *ctx = hpriv->ctx;
2674         struct hl_fence **fence_arr;
2675         void __user *seq_arr;
2676         u32 size_to_copy;
2677         u64 *cs_seq_arr;
2678         u8 seq_arr_len;
2679         int rc;
2680
2681         if (!hdev->supports_wait_for_multi_cs) {
2682                 dev_err(hdev->dev, "Wait for multi CS is not supported\n");
2683                 return -EPERM;
2684         }
2685
2686         seq_arr_len = args->in.seq_arr_len;
2687
2688         if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
2689                 dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
2690                                 HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
2691                 return -EINVAL;
2692         }
2693
2694         /* allocate memory for sequence array */
2695         cs_seq_arr =
2696                 kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
2697         if (!cs_seq_arr)
2698                 return -ENOMEM;
2699
2700         /* copy CS sequence array from user */
2701         seq_arr = (void __user *) (uintptr_t) args->in.seq;
2702         size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
2703         if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
2704                 dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
2705                 rc = -EFAULT;
2706                 goto free_seq_arr;
2707         }
2708
2709         /* allocate array for the fences */
2710         fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL);
2711         if (!fence_arr) {
2712                 rc = -ENOMEM;
2713                 goto free_seq_arr;
2714         }
2715
2716         /* initialize the multi-CS internal data */
2717         mcs_data.ctx = ctx;
2718         mcs_data.seq_arr = cs_seq_arr;
2719         mcs_data.fence_arr = fence_arr;
2720         mcs_data.arr_len = seq_arr_len;
2721
2722         hl_ctx_get(hdev, ctx);
2723
2724         /* wait (with timeout) for the first CS to be completed */
2725         mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
2726         mcs_compl = hl_wait_multi_cs_completion_init(hdev);
2727         if (IS_ERR(mcs_compl)) {
2728                 rc = PTR_ERR(mcs_compl);
2729                 goto put_ctx;
2730         }
2731
2732         /* poll all CS fences, extract timestamp */
2733         mcs_data.update_ts = true;
2734         rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2735         /*
2736          * skip wait for CS completion when one of the below is true:
2737          * - an error on the poll function
2738          * - one or more CS in the list completed
2739          * - the user called ioctl with timeout 0
2740          */
2741         if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
2742                 goto completion_fini;
2743
2744         while (true) {
2745                 rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
2746                 if (rc || (mcs_data.wait_status == 0))
2747                         break;
2748
2749                 /*
2750                  * poll fences once again to update the CS map.
2751                  * no timestamp should be updated this time.
2752                  */
2753                 mcs_data.update_ts = false;
2754                 rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2755
2756                 if (rc || mcs_data.completion_bitmap)
2757                         break;
2758
2759                 /*
2760                  * If hl_wait_multi_cs_completion() returned before the timeout
2761                  * expired, the completion came either from a CS in the multi-CS
2762                  * list (then completion_bitmap is non-empty and we break out
2763                  * above) or from a CS submitted to one of the shared stream
2764                  * masters but not in the multi-CS list. In the latter case, wait
2765                  * again with the remaining timeout and clear the timestamp so a
2766                  * CS belonging to the current multi-CS can set a new, relevant one.
2767                  */
2768                 mcs_data.timeout_jiffies = mcs_data.wait_status;
2769                 mcs_compl->timestamp = 0;
2770         }
2771
2772 completion_fini:
2773         hl_wait_multi_cs_completion_fini(mcs_compl);
2774
2775 put_ctx:
2776         hl_ctx_put(ctx);
2777         kfree(fence_arr);
2778
2779 free_seq_arr:
2780         kfree(cs_seq_arr);
2781
2782         if (rc)
2783                 return rc;
2784
2785         if (mcs_data.wait_status == -ERESTARTSYS) {
2786                 dev_err_ratelimited(hdev->dev,
2787                                 "user process got signal while waiting for Multi-CS\n");
2788                 return -EINTR;
2789         }
2790
2791         /* update output args */
2792         memset(args, 0, sizeof(*args));
2793
2794         if (mcs_data.completion_bitmap) {
2795                 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
2796                 args->out.cs_completion_map = mcs_data.completion_bitmap;
2797
2798                 /* a non-zero timestamp is valid */
2799                 if (mcs_data.timestamp) {
2800                         args->out.timestamp_nsec = mcs_data.timestamp;
2801                         args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
2802                 }
2803
2804                 /* a zero timestamp means that some CS was already gone */
2805                 if (!mcs_data.timestamp)
2806                         args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
2807         } else {
2808                 args->out.status = HL_WAIT_CS_STATUS_BUSY;
2809         }
2810
2811         return 0;
2812 }
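
/*
 * Illustrative user-space sketch (not part of the driver): driving the
 * multi-CS wait path above through the wait ioctl. The field and flag names
 * are the ones used in this file; the ioctl request macro (HL_IOCTL_WAIT_CS),
 * the device fd handling and the handler name are assumptions of this sketch.
 *
 *      union hl_wait_cs_args args = {0};
 *      __u64 seq_arr[2] = { cs_seq_a, cs_seq_b };      // sequences returned at CS submission
 *
 *      args.in.seq = (__u64) (uintptr_t) seq_arr;
 *      args.in.seq_arr_len = 2;
 *      args.in.timeout_us = 1000000;                   // wait up to one second
 *      args.in.flags = HL_WAIT_CS_FLAGS_MULTI_CS;
 *
 *      if (!ioctl(fd, HL_IOCTL_WAIT_CS, &args) &&
 *          args.out.status == HL_WAIT_CS_STATUS_COMPLETED)
 *              // args.out.cs_completion_map holds one bit per completed CS,
 *              // ordered as in seq_arr; a valid timestamp is indicated by
 *              // HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD in args.out.flags.
 *              handle_completed(args.out.cs_completion_map);   // hypothetical handler
 */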
2813
2814 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
2815 {
2816         struct hl_device *hdev = hpriv->hdev;
2817         union hl_wait_cs_args *args = data;
2818         enum hl_cs_wait_status status;
2819         u64 seq = args->in.seq;
2820         s64 timestamp;
2821         int rc;
2822
2823         rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq,
2824                                 &status, &timestamp);
2825
2826         if (rc == -ERESTARTSYS) {
2827                 dev_err_ratelimited(hdev->dev,
2828                         "user process got signal while waiting for CS handle %llu\n",
2829                         seq);
2830                 return -EINTR;
2831         }
2832
2833         memset(args, 0, sizeof(*args));
2834
2835         if (rc) {
2836                 if (rc == -ETIMEDOUT) {
2837                         dev_err_ratelimited(hdev->dev,
2838                                 "CS %llu has timed-out while user process is waiting for it\n",
2839                                 seq);
2840                         args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
2841                 } else if (rc == -EIO) {
2842                         dev_err_ratelimited(hdev->dev,
2843                                 "CS %llu has been aborted while user process is waiting for it\n",
2844                                 seq);
2845                         args->out.status = HL_WAIT_CS_STATUS_ABORTED;
2846                 }
2847                 return rc;
2848         }
2849
2850         if (timestamp) {
2851                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
2852                 args->out.timestamp_nsec = timestamp;
2853         }
2854
2855         switch (status) {
2856         case CS_WAIT_STATUS_GONE:
2857                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
2858                 fallthrough;
2859         case CS_WAIT_STATUS_COMPLETED:
2860                 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
2861                 break;
2862         case CS_WAIT_STATUS_BUSY:
2863         default:
2864                 args->out.status = HL_WAIT_CS_STATUS_BUSY;
2865                 break;
2866         }
2867
2868         return 0;
2869 }
2870
2871 static int ts_buff_get_kernel_ts_record(struct hl_ts_buff *ts_buff,
2872                                         struct hl_cb *cq_cb,
2873                                         u64 ts_offset, u64 cq_offset, u64 target_value,
2874                                         spinlock_t *wait_list_lock,
2875                                         struct hl_user_pending_interrupt **pend)
2876 {
2877         struct hl_user_pending_interrupt *requested_offset_record =
2878                                 (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
2879                                 ts_offset;
2880         struct hl_user_pending_interrupt *cb_last =
2881                         (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
2882                         (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
2883         unsigned long flags, iter_counter = 0;
2884         u64 current_cq_counter;
2885
2886         /* Validate that ts_offset does not point past the end of the buffer */
2887         if (requested_offset_record >= cb_last) {
2888                 dev_err(ts_buff->hdev->dev, "Ts offset exceeds max CB offset (0x%llx)\n",
2889                                                                 (u64)(uintptr_t)cb_last);
2890                 return -EINVAL;
2891         }
2892
2893 start_over:
2894         spin_lock_irqsave(wait_list_lock, flags);
2895
2896         /* Unregister the record only if the target value was not reached yet,
2897          * since in that case the irq handler will not touch it and it is
2898          * therefore safe to delete the node from the interrupt wait list and
2899          * reuse it for another interrupt.
2900          */
2901         if (requested_offset_record->ts_reg_info.in_use) {
2902                 current_cq_counter = *requested_offset_record->cq_kernel_addr;
2903                 if (current_cq_counter < requested_offset_record->cq_target_value) {
2904                         list_del(&requested_offset_record->wait_list_node);
2905                         spin_unlock_irqrestore(wait_list_lock, flags);
2906
2907                         hl_ts_put(requested_offset_record->ts_reg_info.ts_buff);
2908                         hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
2909
2910                         dev_dbg(ts_buff->hdev->dev, "ts node removed from interrupt list, can now be reused\n");
2911                 } else {
2912                         dev_dbg(ts_buff->hdev->dev, "ts node is in the middle of irq handling\n");
2913
2914                         /* irq handling is in progress, give it time to finish */
2915                         spin_unlock_irqrestore(wait_list_lock, flags);
2916                         usleep_range(1, 10);
2917                         if (++iter_counter == MAX_TS_ITER_NUM) {
2918                                 dev_err(ts_buff->hdev->dev, "handling registration interrupt took too long\n");
2919                                 return -EINVAL;
2920                         }
2921
2922                         goto start_over;
2923                 }
2924         } else {
2925                 spin_unlock_irqrestore(wait_list_lock, flags);
2926         }
2927
2928         /* Fill up the new registration node info */
2929         requested_offset_record->ts_reg_info.in_use = 1;
2930         requested_offset_record->ts_reg_info.ts_buff = ts_buff;
2931         requested_offset_record->ts_reg_info.cq_cb = cq_cb;
2932         requested_offset_record->ts_reg_info.timestamp_kernel_addr =
2933                         (u64 *) ts_buff->user_buff_address + ts_offset;
2934         requested_offset_record->cq_kernel_addr =
2935                         (u64 *) cq_cb->kernel_address + cq_offset;
2936         requested_offset_record->cq_target_value = target_value;
2937
2938         *pend = requested_offset_record;
2939
2940         dev_dbg(ts_buff->hdev->dev, "Found available node in TS kernel CB(0x%llx)\n",
2941                                                 (u64)(uintptr_t)requested_offset_record);
2942         return 0;
2943 }
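
/*
 * Layout note: the timestamp buffer is treated as an array of
 * struct hl_user_pending_interrupt records in its kernel mapping, while the
 * matching user mapping is treated as an array of u64 timestamps. ts_offset
 * indexes both arrays record by record, so the irq handler can write the
 * timestamp into the user-visible slot once the CQ counter reaches
 * cq_target_value.
 */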
2944
2945 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
2946                                 struct hl_cb_mgr *cb_mgr, struct hl_ts_mgr *ts_mgr,
2947                                 u64 timeout_us, u64 cq_counters_handle, u64 cq_counters_offset,
2948                                 u64 target_value, struct hl_user_interrupt *interrupt,
2949                                 bool register_ts_record, u64 ts_handle, u64 ts_offset,
2950                                 u32 *status, u64 *timestamp)
2951 {
2952         u32 cq_patched_handle, ts_patched_handle;
2953         struct hl_user_pending_interrupt *pend;
2954         struct hl_ts_buff *ts_buff;
2955         struct hl_cb *cq_cb;
2956         unsigned long timeout, flags;
2957         long completion_rc;
2958         int rc = 0;
2959
2960         timeout = hl_usecs64_to_jiffies(timeout_us);
2961
2962         hl_ctx_get(hdev, ctx);
2963
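        /* Buffer handles passed from user space encode the buffer id shifted
         * left by PAGE_SHIFT (which lets the handle double as the buffer's mmap
         * offset); shift back to recover the id. The same applies to ts_handle
         * below when a timestamp record is registered.
         */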
2964         cq_patched_handle = lower_32_bits(cq_counters_handle >> PAGE_SHIFT);
2965         cq_cb = hl_cb_get(hdev, cb_mgr, cq_patched_handle);
2966         if (!cq_cb) {
2967                 rc = -EINVAL;
2968                 goto put_ctx;
2969         }
2970
2971         if (register_ts_record) {
2972                 dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
2973                                         interrupt->interrupt_id, ts_offset, cq_counters_offset);
2974
2975                 ts_patched_handle = lower_32_bits(ts_handle >> PAGE_SHIFT);
2976                 ts_buff = hl_ts_get(hdev, ts_mgr, ts_patched_handle);
2977                 if (!ts_buff) {
2978                         rc = -EINVAL;
2979                         goto put_cq_cb;
2980                 }
2981
2982                 /* Find first available record */
2983                 rc = ts_buff_get_kernel_ts_record(ts_buff, cq_cb, ts_offset,
2984                                                 cq_counters_offset, target_value,
2985                                                 &interrupt->wait_list_lock, &pend);
2986                 if (rc)
2987                         goto put_ts_buff;
2988         } else {
2989                 pend = kzalloc(sizeof(*pend), GFP_KERNEL);
2990                 if (!pend) {
2991                         rc = -ENOMEM;
2992                         goto put_cq_cb;
2993                 }
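                /* This fence is not tied to any CS, so ULONG_MAX is used as a
                 * placeholder sequence number.
                 */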
2994                 hl_fence_init(&pend->fence, ULONG_MAX);
2995                 pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset;
2996                 pend->cq_target_value = target_value;
2997         }
2998
2999         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3000
3001         /* Check the current counter value before adding the node to the wait
3002          * list, as the target interrupt may have already been received
3003          */
3004         if (*pend->cq_kernel_addr >= target_value) {
3005                 if (register_ts_record)
3006                         pend->ts_reg_info.in_use = 0;
3007                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3008
3009                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3010
3011                 if (register_ts_record) {
3012                         *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
3013                         goto put_ts_buff;
3014                 } else {
3015                         pend->fence.timestamp = ktime_get();
3016                         goto set_timestamp;
3017                 }
3018         } else if (!timeout_us) {
3019                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3020                 *status = HL_WAIT_CS_STATUS_BUSY;
3021                 pend->fence.timestamp = ktime_get();
3022                 goto set_timestamp;
3023         }
3024
3025         /* Add the pending user interrupt to the relevant list for the
3026          * interrupt handler to monitor.
3027          * Note that the list cannot be kept sorted by target value (which
3028          * would shorten the traversal loop), since the same list may hold
3029          * nodes that refer to different cq counter handles.
3030          */
3031         list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3032         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3033
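        /* For a timestamp registration there is nothing to block on here: the
         * record stays on the interrupt wait list and the irq handler writes the
         * timestamp once the CQ counter reaches the target value. Note that
         * HL_WAIT_CS_STATUS_COMPLETED is 0, so the assignment below also clears
         * rc for the caller.
         */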
3034         if (register_ts_record) {
3035                 rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
3036                 goto ts_registration_exit;
3037         }
3038
3039         /* Wait for interrupt handler to signal completion */
3040         completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3041                                                                 timeout);
3042         if (completion_rc > 0) {
3043                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3044         } else {
3045                 if (completion_rc == -ERESTARTSYS) {
3046                         dev_err_ratelimited(hdev->dev,
3047                                         "user process got signal while waiting for interrupt ID %d\n",
3048                                         interrupt->interrupt_id);
3049                         rc = -EINTR;
3050                         *status = HL_WAIT_CS_STATUS_ABORTED;
3051                 } else {
3052                         if (pend->fence.error == -EIO) {
3053                                 dev_err_ratelimited(hdev->dev,
3054                                                 "interrupt based wait ioctl aborted (error: %d) because a reset cycle was initiated\n",
3055                                                 pend->fence.error);
3056                                 rc = -EIO;
3057                                 *status = HL_WAIT_CS_STATUS_ABORTED;
3058                         } else {
3059                                 /* The wait has timed-out. We don't know anything beyond that
3060                                  * because the workload wasn't submitted through the driver.
3061                                  * Therefore, from driver's perspective, the workload is still
3062                                  * executing.
3063                                  */
3064                                 rc = 0;
3065                                 *status = HL_WAIT_CS_STATUS_BUSY;
3066                         }
3067                 }
3068         }
3069
3070         /*
3071          * Remove the node from the list here rather than in the irq handler,
3072          * to cover the completion-timeout case. If this is a ts record
3073          * registration, the node is instead deleted in the irq handler once
3074          * the target value is reached.
3075          */
3076         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3077         list_del(&pend->wait_list_node);
3078         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3079
3080 set_timestamp:
3081         *timestamp = ktime_to_ns(pend->fence.timestamp);
3082         kfree(pend);
3083         hl_cb_put(cq_cb);
3084 ts_registration_exit:
3085         hl_ctx_put(ctx);
3086
3087         return rc;
3088
3089 put_ts_buff:
3090         hl_ts_put(ts_buff);
3091 put_cq_cb:
3092         hl_cb_put(cq_cb);
3093 put_ctx:
3094         hl_ctx_put(ctx);
3095
3096         return rc;
3097 }
3098
3099 static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
3100                                 u64 timeout_us, u64 user_address,
3101                                 u64 target_value, struct hl_user_interrupt *interrupt,
3103                                 u32 *status,
3104                                 u64 *timestamp)
3105 {
3106         struct hl_user_pending_interrupt *pend;
3107         unsigned long timeout, flags;
3108         u64 completion_value;
3109         long completion_rc;
3110         int rc = 0;
3111
3112         timeout = hl_usecs64_to_jiffies(timeout_us);
3113
3114         hl_ctx_get(hdev, ctx);
3115
3116         pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3117         if (!pend) {
3118                 hl_ctx_put(ctx);
3119                 return -ENOMEM;
3120         }
3121
3122         hl_fence_init(&pend->fence, ULONG_MAX);
3123
3124         /* Add pending user interrupt to relevant list for the interrupt
3125          * handler to monitor
3126          */
3127         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3128         list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3129         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3130
3131         /* We check for completion value as interrupt could have been received
3132          * before we added the node to the wait list
3133          */
3134         if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3135                 dev_err(hdev->dev, "Failed to copy completion value from user\n");
3136                 rc = -EFAULT;
3137                 goto remove_pending_user_interrupt;
3138         }
3139
3140         if (completion_value >= target_value) {
3141                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3142                 /* There was no interrupt, we assume the completion is now. */
3143                 pend->fence.timestamp = ktime_get();
3144         } else {
3145                 *status = HL_WAIT_CS_STATUS_BUSY;
3146         }
3147
3148         if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
3149                 goto remove_pending_user_interrupt;
3150
3151 wait_again:
3152         /* Wait for interrupt handler to signal completion */
3153         completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3154                                                                                 timeout);
3155
3156         /* If timeout did not expire we need to perform the comparison.
3157          * If comparison fails, keep waiting until timeout expires
3158          */
3159         if (completion_rc > 0) {
3160                 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3161                 /* reinit_completion must be called before we check for user
3162                  * completion value, otherwise, if interrupt is received after
3163                  * the comparison and before the next wait_for_completion,
3164                  * we will reach timeout and fail
3165                  */
3166                 reinit_completion(&pend->fence.completion);
3167                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3168
3169                 if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3170                         dev_err(hdev->dev, "Failed to copy completion value from user\n");
3171                         rc = -EFAULT;
3172
3173                         goto remove_pending_user_interrupt;
3174                 }
3175
3176                 if (completion_value >= target_value) {
3177                         *status = HL_WAIT_CS_STATUS_COMPLETED;
3178                 } else if (pend->fence.error) {
3179                         dev_err_ratelimited(hdev->dev,
3180                                 "interrupt based wait ioctl aborted (error: %d) because a reset cycle was initiated\n",
3181                                 pend->fence.error);
3182                         /* set the command completion status as ABORTED */
3183                         *status = HL_WAIT_CS_STATUS_ABORTED;
3184                 } else {
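                        /* The completion fired but the user counter has not
                         * reached the target value yet; completion_rc holds the
                         * remaining jiffies, so keep waiting with what is left.
                         */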
3185                         timeout = completion_rc;
3186                         goto wait_again;
3187                 }
3188         } else if (completion_rc == -ERESTARTSYS) {
3189                 dev_err_ratelimited(hdev->dev,
3190                         "user process got signal while waiting for interrupt ID %d\n",
3191                         interrupt->interrupt_id);
3192                 rc = -EINTR;
3193         } else {
3194                 /* The wait has timed-out. We don't know anything beyond that
3195                  * because the workload wasn't submitted through the driver.
3196                  * Therefore, from driver's perspective, the workload is still
3197                  * executing.
3198                  */
3199                 rc = 0;
3200                 *status = HL_WAIT_CS_STATUS_BUSY;
3201         }
3202
3203 remove_pending_user_interrupt:
3204         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3205         list_del(&pend->wait_list_node);
3206         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3207
3208         *timestamp = ktime_to_ns(pend->fence.timestamp);
3209
3210         kfree(pend);
3211         hl_ctx_put(ctx);
3212
3213         return rc;
3214 }
3215
3216 static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3217 {
3218         u16 interrupt_id, first_interrupt, last_interrupt;
3219         struct hl_device *hdev = hpriv->hdev;
3220         struct asic_fixed_properties *prop;
3221         struct hl_user_interrupt *interrupt;
3222         union hl_wait_cs_args *args = data;
3223         u32 status = HL_WAIT_CS_STATUS_BUSY;
3224         u64 timestamp;
3225         int rc;
3226
3227         prop = &hdev->asic_prop;
3228
3229         if (!prop->user_interrupt_count) {
3230                 dev_err(hdev->dev, "no user interrupts allowed\n");
3231                 return -EPERM;
3232         }
3233
3234         interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
3235
3236         first_interrupt = prop->first_available_user_msix_interrupt;
3237         last_interrupt = prop->first_available_user_msix_interrupt +
3238                                                 prop->user_interrupt_count - 1;
3239
3240         if ((interrupt_id < first_interrupt || interrupt_id > last_interrupt) &&
3241                         interrupt_id != HL_COMMON_USER_INTERRUPT_ID) {
3242                 dev_err(hdev->dev, "invalid user interrupt %u\n", interrupt_id);
3243                 return -EINVAL;
3244         }
3245
3246         if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID)
3247                 interrupt = &hdev->common_user_interrupt;
3248         else
3249                 interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];
3250
3251         if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
3252                 rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr, &hpriv->ts_mem_mgr,
3253                                 args->in.interrupt_timeout_us, args->in.cq_counters_handle,
3254                                 args->in.cq_counters_offset,
3255                                 args->in.target, interrupt,
3256                                 !!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT),
3257                                 args->in.timestamp_handle, args->in.timestamp_offset,
3258                                 &status, &timestamp);
3259         else
3260                 rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
3261                                 args->in.interrupt_timeout_us, args->in.addr,
3262                                 args->in.target, interrupt, &status,
3263                                 &timestamp);
3264         if (rc)
3265                 return rc;
3266
3267         memset(args, 0, sizeof(*args));
3268         args->out.status = status;
3269
3270         if (timestamp) {
3271                 args->out.timestamp_nsec = timestamp;
3272                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3273         }
3274
3275         return 0;
3276 }
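
/*
 * Illustrative user-space sketch (not part of the driver): waiting on a 64-bit
 * counter in user memory via the interrupt wait path above. Field and flag
 * names are the ones used in this file; the exact encoding of the interrupt id
 * inside HL_WAIT_CS_FLAGS_INTERRUPT_MASK, the ioctl request macro and the
 * helper names are assumptions of this sketch.
 *
 *      volatile __u64 *counter = ...;                  // device-visible user buffer
 *      union hl_wait_cs_args args = {0};
 *
 *      args.in.addr = (__u64) (uintptr_t) counter;
 *      args.in.target = expected_value;
 *      args.in.interrupt_timeout_us = 1000000;
 *      args.in.flags = HL_WAIT_CS_FLAGS_INTERRUPT |
 *                      encode_interrupt_id(my_interrupt_id);   // hypothetical helper
 *
 *      rc = ioctl(fd, HL_IOCTL_WAIT_CS, &args);        // request macro assumed
 *      // args.out.status is HL_WAIT_CS_STATUS_COMPLETED once
 *      // *counter >= expected_value, or HL_WAIT_CS_STATUS_BUSY on timeout.
 */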
3277
3278 int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3279 {
3280         union hl_wait_cs_args *args = data;
3281         u32 flags = args->in.flags;
3282         int rc;
3283
3284         /* If the device is not operational, no point in waiting for any command submission or
3285          * user interrupt
3286          */
3287         if (!hl_device_operational(hpriv->hdev, NULL))
3288                 return -EBUSY;
3289
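        /* Flag precedence: an interrupt wait request is served first, then a
         * multi-CS wait, and a plain CS wait otherwise.
         */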
3290         if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
3291                 rc = hl_interrupt_wait_ioctl(hpriv, data);
3292         else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
3293                 rc = hl_multi_cs_wait_ioctl(hpriv, data);
3294         else
3295                 rc = hl_cs_wait_ioctl(hpriv, data);
3296
3297         return rc;
3298 }