kernel/sched_rt.c
1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5
6 #ifdef CONFIG_SMP
7
8 static inline int rt_overloaded(struct rq *rq)
9 {
10         return atomic_read(&rq->rd->rto_count);
11 }
12
13 static inline void rt_set_overload(struct rq *rq)
14 {
15         cpu_set(rq->cpu, rq->rd->rto_mask);
16         /*
17          * Make sure the mask is visible before we set
18          * the overload count. That is checked to determine
19          * if we should look at the mask. It would be a shame
20          * if we looked at the mask, but the mask was not
21          * updated yet.
22          */
23         wmb();
24         atomic_inc(&rq->rd->rto_count);
25 }
26
27 static inline void rt_clear_overload(struct rq *rq)
28 {
29         /* the order here really doesn't matter */
30         atomic_dec(&rq->rd->rto_count);
31         cpu_clear(rq->cpu, rq->rd->rto_mask);
32 }
33
34 static void update_rt_migration(struct rq *rq)
35 {
36         if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
37                 if (!rq->rt.overloaded) {
38                         rt_set_overload(rq);
39                         rq->rt.overloaded = 1;
40                 }
41         } else if (rq->rt.overloaded) {
42                 rt_clear_overload(rq);
43                 rq->rt.overloaded = 0;
44         }
45 }
46 #endif /* CONFIG_SMP */
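
rt_set_overload() publishes the CPU's bit in rd->rto_mask before bumping rd->rto_count, with wmb() in between, so that any CPU that observes a non-zero count is guaranteed to also see the corresponding mask bit. The following is a minimal user-space sketch of that publish/observe ordering, using C11 atomics in place of cpu_set()/wmb()/atomic_inc(); the names and the single-word mask are illustrative only, not kernel API.

/* Illustrative user-space sketch of the rt_set_overload() ordering,
 * using C11 atomics instead of the kernel primitives. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long rto_mask;   /* stands in for rd->rto_mask */
static atomic_int rto_count;             /* stands in for rd->rto_count */

static void set_overload(int cpu)
{
        /* Publish the mask bit first ... */
        atomic_fetch_or_explicit(&rto_mask, 1UL << cpu, memory_order_relaxed);
        /* ... then the count, with release ordering standing in for wmb(). */
        atomic_fetch_add_explicit(&rto_count, 1, memory_order_release);
}

static void clear_overload(int cpu)
{
        /* The order here doesn't matter, mirroring rt_clear_overload(). */
        atomic_fetch_sub_explicit(&rto_count, 1, memory_order_relaxed);
        atomic_fetch_and_explicit(&rto_mask, ~(1UL << cpu), memory_order_relaxed);
}

static bool overloaded(void)
{
        /* Acquire pairs with the release above: seeing a non-zero count
         * implies the mask bit set before it is visible too. */
        return atomic_load_explicit(&rto_count, memory_order_acquire) != 0;
}

int main(void)
{
        set_overload(3);
        if (overloaded())
                printf("mask = %#lx\n", atomic_load(&rto_mask));
        clear_overload(3);
        return 0;
}
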
47
48 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49 {
50         return container_of(rt_se, struct task_struct, rt);
51 }
52
53 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54 {
55         return !list_empty(&rt_se->run_list);
56 }
57
58 #ifdef CONFIG_RT_GROUP_SCHED
59
60 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
61 {
62         if (!rt_rq->tg)
63                 return RUNTIME_INF;
64
65         return rt_rq->tg->rt_runtime;
66 }
67
68 #define for_each_leaf_rt_rq(rt_rq, rq) \
69         list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72 {
73         return rt_rq->rq;
74 }
75
76 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77 {
78         return rt_se->rt_rq;
79 }
80
81 #define for_each_sched_rt_entity(rt_se) \
82         for (; rt_se; rt_se = rt_se->parent)
83
84 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85 {
86         return rt_se->my_q;
87 }
88
89 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
93 {
94         struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96         if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97                 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
98
99                 enqueue_rt_entity(rt_se);
100                 if (rt_rq->highest_prio < curr->prio)
101                         resched_task(curr);
102         }
103 }
104
105 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
106 {
107         struct sched_rt_entity *rt_se = rt_rq->rt_se;
108
109         if (rt_se && on_rt_rq(rt_se))
110                 dequeue_rt_entity(rt_se);
111 }
112
113 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
114 {
115         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
116 }
117
118 static int rt_se_boosted(struct sched_rt_entity *rt_se)
119 {
120         struct rt_rq *rt_rq = group_rt_rq(rt_se);
121         struct task_struct *p;
122
123         if (rt_rq)
124                 return !!rt_rq->rt_nr_boosted;
125
126         p = rt_task_of(rt_se);
127         return p->prio != p->normal_prio;
128 }
129
130 #else
131
132 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
133 {
134         if (sysctl_sched_rt_runtime == -1)
135                 return RUNTIME_INF;
136
137         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
138 }
139
140 #define for_each_leaf_rt_rq(rt_rq, rq) \
141         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
142
143 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
144 {
145         return container_of(rt_rq, struct rq, rt);
146 }
147
148 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
149 {
150         struct task_struct *p = rt_task_of(rt_se);
151         struct rq *rq = task_rq(p);
152
153         return &rq->rt;
154 }
155
156 #define for_each_sched_rt_entity(rt_se) \
157         for (; rt_se; rt_se = NULL)
158
159 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
160 {
161         return NULL;
162 }
163
164 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
165 {
166 }
167
168 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
169 {
170 }
171
172 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173 {
174         return rt_rq->rt_throttled;
175 }
176 #endif
177
178 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
179 {
180 #ifdef CONFIG_RT_GROUP_SCHED
181         struct rt_rq *rt_rq = group_rt_rq(rt_se);
182
183         if (rt_rq)
184                 return rt_rq->highest_prio;
185 #endif
186
187         return rt_task_of(rt_se)->prio;
188 }
189
190 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
191 {
192         u64 runtime = sched_rt_runtime(rt_rq);
193
194         if (runtime == RUNTIME_INF)
195                 return 0;
196
197         if (rt_rq->rt_throttled)
198                 return rt_rq_throttled(rt_rq);
199
200         if (rt_rq->rt_time > runtime) {
201                 struct rq *rq = rq_of_rt_rq(rt_rq);
202
203                 rq->rt_throttled = 1;
204                 rt_rq->rt_throttled = 1;
205
206                 if (rt_rq_throttled(rt_rq)) {
207                         sched_rt_rq_dequeue(rt_rq);
208                         return 1;
209                 }
210         }
211
212         return 0;
213 }
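
sched_rt_runtime_exceeded() throttles an rt_rq once the time consumed in the current period (rt_rq->rt_time) exceeds its budget from sched_rt_runtime(); update_sched_rt_period() below pays that budget back once per period and unthrottles the queue when the remaining debt drops under the budget again. The stand-alone sketch below mirrors just that arithmetic; the struct, the helper names and the 950 ms / 980 ms figures are made up for illustration.

/* Stand-alone sketch of RT bandwidth throttling arithmetic.
 * The 950 ms / 980 ms figures are illustrative, not read from the kernel. */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

struct toy_rt_rq {
        uint64_t rt_time;     /* ns consumed in the current period */
        uint64_t rt_runtime;  /* ns allowed per period */
        bool     throttled;
};

/* Mirrors the check in sched_rt_runtime_exceeded(). */
static bool runtime_exceeded(struct toy_rt_rq *rt_rq)
{
        if (rt_rq->rt_time > rt_rq->rt_runtime)
                rt_rq->throttled = true;
        return rt_rq->throttled;
}

/* Mirrors the per-period refresh in update_sched_rt_period(). */
static void period_tick(struct toy_rt_rq *rt_rq)
{
        uint64_t credit = rt_rq->rt_runtime < rt_rq->rt_time ?
                          rt_rq->rt_runtime : rt_rq->rt_time;

        rt_rq->rt_time -= credit;
        if (rt_rq->throttled && rt_rq->rt_time < rt_rq->rt_runtime)
                rt_rq->throttled = false;
}

int main(void)
{
        struct toy_rt_rq rq = { .rt_runtime = 950 * NSEC_PER_MSEC };

        rq.rt_time = 980 * NSEC_PER_MSEC;                   /* ran 980 ms this period */
        printf("throttled: %d\n", runtime_exceeded(&rq));   /* 1 */
        period_tick(&rq);                                   /* new period: 30 ms of debt left */
        printf("throttled: %d, debt: %llu ms\n", rq.throttled,
               (unsigned long long)(rq.rt_time / NSEC_PER_MSEC));
        return 0;
}
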
214
215 static void update_sched_rt_period(struct rq *rq)
216 {
217         struct rt_rq *rt_rq;
218         u64 period;
219
220         while (rq->clock > rq->rt_period_expire) {
221                 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
222                 rq->rt_period_expire += period;
223
224                 for_each_leaf_rt_rq(rt_rq, rq) {
225                         u64 runtime = sched_rt_runtime(rt_rq);
226
227                         rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
228                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
229                                 rt_rq->rt_throttled = 0;
230                                 sched_rt_rq_enqueue(rt_rq);
231                         }
232                 }
233
234                 rq->rt_throttled = 0;
235         }
236 }
237
238 /*
239  * Update the current task's runtime statistics. Skip current tasks that
240  * are not in our scheduling class.
241  */
242 static void update_curr_rt(struct rq *rq)
243 {
244         struct task_struct *curr = rq->curr;
245         struct sched_rt_entity *rt_se = &curr->rt;
246         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
247         u64 delta_exec;
248
249         if (!task_has_rt_policy(curr))
250                 return;
251
252         delta_exec = rq->clock - curr->se.exec_start;
253         if (unlikely((s64)delta_exec < 0))
254                 delta_exec = 0;
255
256         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
257
258         curr->se.sum_exec_runtime += delta_exec;
259         curr->se.exec_start = rq->clock;
260         cpuacct_charge(curr, delta_exec);
261
262         rt_rq->rt_time += delta_exec;
263         if (sched_rt_runtime_exceeded(rt_rq))
264                 resched_task(curr);
265 }
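
update_curr_rt() charges the time elapsed since curr->se.exec_start to the task and to its rt_rq, discards a negative delta in case the clock appears to move backwards, and then restarts the measurement window. A rough user-space analogue of that bookkeeping is sketched below, with CLOCK_MONOTONIC standing in for rq->clock; the toy_entity type is invented for the example.

/* User-space analogue of the delta-exec bookkeeping in update_curr_rt(). */
#define _POSIX_C_SOURCE 200809L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct toy_entity {
        uint64_t exec_start;        /* ns timestamp of last update */
        uint64_t sum_exec_runtime;  /* ns accumulated so far */
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void update_curr(struct toy_entity *se)
{
        uint64_t clock = now_ns();
        int64_t delta = (int64_t)(clock - se->exec_start);

        /* Like the (s64)delta_exec < 0 check: never charge negative time. */
        if (delta < 0)
                delta = 0;

        se->sum_exec_runtime += (uint64_t)delta;
        se->exec_start = clock;     /* restart the measurement window */
}

int main(void)
{
        struct toy_entity se = { .exec_start = now_ns() };

        for (volatile long i = 0; i < 10000000; i++)
                ;                   /* burn a little CPU */
        update_curr(&se);
        printf("charged %llu ns\n", (unsigned long long)se.sum_exec_runtime);
        return 0;
}
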
266
267 static inline
268 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
269 {
270         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
271         rt_rq->rt_nr_running++;
272 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
273         if (rt_se_prio(rt_se) < rt_rq->highest_prio)
274                 rt_rq->highest_prio = rt_se_prio(rt_se);
275 #endif
276 #ifdef CONFIG_SMP
277         if (rt_se->nr_cpus_allowed > 1) {
278                 struct rq *rq = rq_of_rt_rq(rt_rq);
279                 rq->rt.rt_nr_migratory++;
280         }
281
282         update_rt_migration(rq_of_rt_rq(rt_rq));
283 #endif
284 #ifdef CONFIG_RT_GROUP_SCHED
285         if (rt_se_boosted(rt_se))
286                 rt_rq->rt_nr_boosted++;
287 #endif
288 }
289
290 static inline
291 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
292 {
293         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
294         WARN_ON(!rt_rq->rt_nr_running);
295         rt_rq->rt_nr_running--;
296 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
297         if (rt_rq->rt_nr_running) {
298                 struct rt_prio_array *array;
299
300                 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
301                 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
302                         /* recalculate */
303                         array = &rt_rq->active;
304                         rt_rq->highest_prio =
305                                 sched_find_first_bit(array->bitmap);
306                 } /* otherwise leave rt_rq->highest_prio alone */
307         } else
308                 rt_rq->highest_prio = MAX_RT_PRIO;
309 #endif
310 #ifdef CONFIG_SMP
311         if (rt_se->nr_cpus_allowed > 1) {
312                 struct rq *rq = rq_of_rt_rq(rt_rq);
313                 rq->rt.rt_nr_migratory--;
314         }
315
316         update_rt_migration(rq_of_rt_rq(rt_rq));
317 #endif /* CONFIG_SMP */
318 #ifdef CONFIG_RT_GROUP_SCHED
319         if (rt_se_boosted(rt_se))
320                 rt_rq->rt_nr_boosted--;
321
322         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
323 #endif
324 }
325
326 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
327 {
328         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
329         struct rt_prio_array *array = &rt_rq->active;
330         struct rt_rq *group_rq = group_rt_rq(rt_se);
331
332         if (group_rq && rt_rq_throttled(group_rq))
333                 return;
334
335         list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
336         __set_bit(rt_se_prio(rt_se), array->bitmap);
337
338         inc_rt_tasks(rt_se, rt_rq);
339 }
340
341 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
342 {
343         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
344         struct rt_prio_array *array = &rt_rq->active;
345
346         list_del_init(&rt_se->run_list);
347         if (list_empty(array->queue + rt_se_prio(rt_se)))
348                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
349
350         dec_rt_tasks(rt_se, rt_rq);
351 }
352
353 /*
354  * Because the prio of an upper entry depends on the lower
355  * entries, we must remove entries top-down.
356  *
357  * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
358  *      doesn't matter much for now, as h=2 for GROUP_SCHED.
359  */
360 static void dequeue_rt_stack(struct task_struct *p)
361 {
362         struct sched_rt_entity *rt_se, *top_se;
363
364         /*
365          * dequeue all, top-down.
366          */
367         do {
368                 rt_se = &p->rt;
369                 top_se = NULL;
370                 for_each_sched_rt_entity(rt_se) {
371                         if (on_rt_rq(rt_se))
372                                 top_se = rt_se;
373                 }
374                 if (top_se)
375                         dequeue_rt_entity(top_se);
376         } while (top_se);
377 }
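
dequeue_rt_stack() can only walk the entity chain bottom-up (via ->parent), so on each pass it remembers the topmost entity that is still queued and dequeues that one, repeating until nothing is left on a queue; the net effect is top-down removal. A toy model of that walk is sketched below; the structure and names are invented purely for illustration.

/* Toy model of dequeue_rt_stack(): walk up a parent chain, find the
 * topmost still-queued entity, dequeue it, repeat until none are left. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_entity {
        const char *name;
        struct toy_entity *parent;  /* NULL at the top of the hierarchy */
        bool queued;
};

static void dequeue_stack(struct toy_entity *leaf)
{
        struct toy_entity *top;

        do {
                struct toy_entity *se = leaf;

                top = NULL;
                for (; se; se = se->parent)     /* bottom-up walk ... */
                        if (se->queued)
                                top = se;       /* ... remembering the topmost */
                if (top) {
                        top->queued = false;    /* dequeue top-down, one per pass */
                        printf("dequeued %s\n", top->name);
                }
        } while (top);
}

int main(void)
{
        struct toy_entity root  = { "root",  NULL,   true };
        struct toy_entity group = { "group", &root,  true };
        struct toy_entity task  = { "task",  &group, true };

        dequeue_stack(&task);   /* prints root, then group, then task */
        return 0;
}
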
378
379 /*
380  * Adding/removing a task to/from a priority array:
381  */
382 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
383 {
384         struct sched_rt_entity *rt_se = &p->rt;
385
386         if (wakeup)
387                 rt_se->timeout = 0;
388
389         dequeue_rt_stack(p);
390
391         /*
392          * enqueue everybody, bottom-up.
393          */
394         for_each_sched_rt_entity(rt_se)
395                 enqueue_rt_entity(rt_se);
396 }
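
The enqueue/dequeue helpers above maintain, per rt_rq, one FIFO list per priority level plus a bitmap of non-empty levels, so the highest-priority runnable entity can be found with a single find-first-set over the bitmap (sched_find_first_bit() in pick_next_rt_entity() below). A compact user-space sketch of that O(1) priority array follows; the 100-level array, the fixed-size queues and the helper names are assumptions made for the example, not the kernel's layout.

/* Minimal sketch of an O(1) priority array: a bitmap of non-empty
 * levels plus one FIFO queue per priority level (0 = highest). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>        /* ffs() */

#define TOY_MAX_PRIO  100
#define TOY_QUEUE_LEN 16

struct toy_prio_array {
        uint32_t bitmap[(TOY_MAX_PRIO + 31) / 32];
        int      queue[TOY_MAX_PRIO][TOY_QUEUE_LEN];   /* task ids, FIFO */
        int      len[TOY_MAX_PRIO];
};

static void enqueue(struct toy_prio_array *a, int task, int prio)
{
        a->queue[prio][a->len[prio]++] = task;          /* tail of its level */
        a->bitmap[prio / 32] |= 1u << (prio % 32);      /* mark level non-empty */
}

static int pick_next(const struct toy_prio_array *a)
{
        /* Equivalent of sched_find_first_bit(): first non-empty level. */
        for (int w = 0; w < (TOY_MAX_PRIO + 31) / 32; w++) {
                if (a->bitmap[w]) {
                        int prio = w * 32 + ffs(a->bitmap[w]) - 1;
                        return a->queue[prio][0];       /* head of that level */
                }
        }
        return -1;                                      /* nothing runnable */
}

int main(void)
{
        struct toy_prio_array a;

        memset(&a, 0, sizeof(a));
        enqueue(&a, /*task*/ 1, /*prio*/ 40);
        enqueue(&a, 2, 10);
        enqueue(&a, 3, 10);
        printf("next task: %d\n", pick_next(&a));       /* 2: prio 10, FIFO head */
        return 0;
}
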
397
398 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
399 {
400         struct sched_rt_entity *rt_se = &p->rt;
401         struct rt_rq *rt_rq;
402
403         update_curr_rt(rq);
404
405         dequeue_rt_stack(p);
406
407         /*
408          * re-enqueue all non-empty rt_rq entities.
409          */
410         for_each_sched_rt_entity(rt_se) {
411                 rt_rq = group_rt_rq(rt_se);
412                 if (rt_rq && rt_rq->rt_nr_running)
413                         enqueue_rt_entity(rt_se);
414         }
415 }
416
417 /*
418  * Put task to the end of the run list without the overhead of dequeue
419  * followed by enqueue.
420  */
421 static
422 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
423 {
424         struct rt_prio_array *array = &rt_rq->active;
425
426         list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
427 }
428
429 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
430 {
431         struct sched_rt_entity *rt_se = &p->rt;
432         struct rt_rq *rt_rq;
433
434         for_each_sched_rt_entity(rt_se) {
435                 rt_rq = rt_rq_of_se(rt_se);
436                 requeue_rt_entity(rt_rq, rt_se);
437         }
438 }
439
440 static void yield_task_rt(struct rq *rq)
441 {
442         requeue_task_rt(rq, rq->curr);
443 }
444
445 #ifdef CONFIG_SMP
446 static int find_lowest_rq(struct task_struct *task);
447
448 static int select_task_rq_rt(struct task_struct *p, int sync)
449 {
450         struct rq *rq = task_rq(p);
451
452         /*
453          * If the current task is an RT task, then
454          * try to see if we can wake this RT task up on another
455          * runqueue. Otherwise simply start this RT task
456          * on its current runqueue.
457          *
458          * We want to avoid overloading runqueues, even if the woken
459          * RT task is of higher priority than the current RT task.
460          * RT tasks behave differently from other tasks: if one gets
461          * preempted, we try to push it off to another queue.
462          * So trying to keep a preempting RT task on the same
463          * cache-hot CPU would force the running RT task onto a
464          * cold CPU. We would waste all the cache of the lower-priority
465          * RT task in the hope of saving some for an RT task
466          * that is just being woken and will probably have a
467          * cold cache anyway.
468          */
469         if (unlikely(rt_task(rq->curr)) &&
470             (p->rt.nr_cpus_allowed > 1)) {
471                 int cpu = find_lowest_rq(p);
472
473                 return (cpu == -1) ? task_cpu(p) : cpu;
474         }
475
476         /*
477          * Otherwise, just let it ride on the affined RQ and the
478          * post-schedule router will push the preempted task away
479          */
480         return task_cpu(p);
481 }
482 #endif /* CONFIG_SMP */
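
The comment in select_task_rq_rt() boils down to a simple placement rule: if the waking RT task is allowed on more than one CPU and an RT task is already running on its runqueue, send the (probably cache-cold) waker to the CPU with the lowest-priority work instead of disturbing the cache-hot running task. The tiny pure-function restatement below is only an illustration; wake_ctx and its fields are invented, and lowest_cpu stands in for the result of a find_lowest_rq()-style search.

/* Pure-function restatement of the wake-up placement policy in
 * select_task_rq_rt(): only move the waking task if the current task
 * on its runqueue is itself RT and the waker is allowed on other CPUs. */
#include <stdbool.h>
#include <stdio.h>

struct wake_ctx {
        bool curr_is_rt;        /* is an RT task already running here? */
        int  nr_cpus_allowed;   /* how many CPUs may the waker run on? */
        int  task_cpu;          /* the waker's current CPU */
        int  lowest_cpu;        /* result of a find_lowest_rq()-style search,
                                 * -1 if no better CPU was found */
};

static int select_cpu(const struct wake_ctx *c)
{
        if (c->curr_is_rt && c->nr_cpus_allowed > 1 && c->lowest_cpu != -1)
                return c->lowest_cpu;   /* don't disturb the cache-hot RT task */
        return c->task_cpu;             /* stay put; post-schedule may push later */
}

int main(void)
{
        struct wake_ctx c = { .curr_is_rt = true, .nr_cpus_allowed = 4,
                              .task_cpu = 0, .lowest_cpu = 2 };

        printf("wake on CPU %d\n", select_cpu(&c));     /* 2 */
        c.curr_is_rt = false;
        printf("wake on CPU %d\n", select_cpu(&c));     /* 0 */
        return 0;
}
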
483
484 /*
485  * Preempt the current task with a newly woken task if needed:
486  */
487 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
488 {
489         if (p->prio < rq->curr->prio)
490                 resched_task(rq->curr);
491 }
492
493 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
494                                                    struct rt_rq *rt_rq)
495 {
496         struct rt_prio_array *array = &rt_rq->active;
497         struct sched_rt_entity *next = NULL;
498         struct list_head *queue;
499         int idx;
500
501         idx = sched_find_first_bit(array->bitmap);
502         BUG_ON(idx >= MAX_RT_PRIO);
503
504         queue = array->queue + idx;
505         next = list_entry(queue->next, struct sched_rt_entity, run_list);
506
507         return next;
508 }
509
510 static struct task_struct *pick_next_task_rt(struct rq *rq)
511 {
512         struct sched_rt_entity *rt_se;
513         struct task_struct *p;
514         struct rt_rq *rt_rq;
515
516         rt_rq = &rq->rt;
517
518         if (unlikely(!rt_rq->rt_nr_running))
519                 return NULL;
520
521         if (rt_rq_throttled(rt_rq))
522                 return NULL;
523
524         do {
525                 rt_se = pick_next_rt_entity(rq, rt_rq);
526                 BUG_ON(!rt_se);
527                 rt_rq = group_rt_rq(rt_se);
528         } while (rt_rq);
529
530         p = rt_task_of(rt_se);
531         p->se.exec_start = rq->clock;
532         return p;
533 }
534
535 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
536 {
537         update_curr_rt(rq);
538         p->se.exec_start = 0;
539 }
540
541 #ifdef CONFIG_SMP
542
543 /* Only try algorithms three times */
544 #define RT_MAX_TRIES 3
545
546 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
547 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
548
549 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
550 {
551         if (!task_running(rq, p) &&
552             (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
553             (p->rt.nr_cpus_allowed > 1))
554                 return 1;
555         return 0;
556 }
557
558 /* Return the second highest RT task, NULL otherwise */
559 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
560 {
561         struct task_struct *next = NULL;
562         struct sched_rt_entity *rt_se;
563         struct rt_prio_array *array;
564         struct rt_rq *rt_rq;
565         int idx;
566
567         for_each_leaf_rt_rq(rt_rq, rq) {
568                 array = &rt_rq->active;
569                 idx = sched_find_first_bit(array->bitmap);
570  next_idx:
571                 if (idx >= MAX_RT_PRIO)
572                         continue;
573                 if (next && next->prio < idx)
574                         continue;
575                 list_for_each_entry(rt_se, array->queue + idx, run_list) {
576                         struct task_struct *p = rt_task_of(rt_se);
577                         if (pick_rt_task(rq, p, cpu)) {
578                                 next = p;
579                                 break;
580                         }
581                 }
582                 if (!next) {
583                         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
584                         goto next_idx;
585                 }
586         }
587
588         return next;
589 }
590
591 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
592
593 static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
594 {
595         int       lowest_prio = -1;
596         int       lowest_cpu  = -1;
597         int       count       = 0;
598         int       cpu;
599
600         cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
601
602         /*
603          * Scan each rq for the lowest prio.
604          */
605         for_each_cpu_mask(cpu, *lowest_mask) {
606                 struct rq *rq = cpu_rq(cpu);
607
608                 /* We look for lowest RT prio or non-rt CPU */
609                 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
610                         /*
611                          * If we already found a low RT queue
612                          * and now we find this non-RT queue,
613                          * clear the mask and set only our bit.
614                          * Otherwise just leave the mask as is
615                          * and the count==1 will cause the algorithm
616                          * to use the first bit found.
617                          */
618                         if (lowest_cpu != -1) {
619                                 cpus_clear(*lowest_mask);
620                                 cpu_set(rq->cpu, *lowest_mask);
621                         }
622                         return 1;
623                 }
624
625                 /* no locking for now */
626                 if ((rq->rt.highest_prio > task->prio)
627                     && (rq->rt.highest_prio >= lowest_prio)) {
628                         if (rq->rt.highest_prio > lowest_prio) {
629                                 /* new low - clear old data */
630                                 lowest_prio = rq->rt.highest_prio;
631                                 lowest_cpu = cpu;
632                                 count = 0;
633                         }
634                         count++;
635                 } else
636                         cpu_clear(cpu, *lowest_mask);
637         }
638
639         /*
640          * Clear out all the set bits that represent
641          * runqueues that were of higher prio than
642          * the lowest_prio.
643          */
644         if (lowest_cpu > 0) {
645                 /*
646                  * Perhaps we could add another cpumask op to
647                  * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
648                  * Then that could be optimized to use memset and such.
649                  */
650                 for_each_cpu_mask(cpu, *lowest_mask) {
651                         if (cpu >= lowest_cpu)
652                                 break;
653                         cpu_clear(cpu, *lowest_mask);
654                 }
655         }
656
657         return count;
658 }
659
660 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
661 {
662         int first;
663
664         /* "this_cpu" is cheaper to preempt than a remote processor */
665         if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
666                 return this_cpu;
667
668         first = first_cpu(*mask);
669         if (first != NR_CPUS)
670                 return first;
671
672         return -1;
673 }
674
675 static int find_lowest_rq(struct task_struct *task)
676 {
677         struct sched_domain *sd;
678         cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
679         int this_cpu = smp_processor_id();
680         int cpu      = task_cpu(task);
681         int count    = find_lowest_cpus(task, lowest_mask);
682
683         if (!count)
684                 return -1; /* No targets found */
685
686         /*
687          * There is no sense in performing an optimal search if only one
688          * target is found.
689          */
690         if (count == 1)
691                 return first_cpu(*lowest_mask);
692
693         /*
694          * At this point we have built a mask of cpus representing the
695          * lowest priority tasks in the system.  Now we want to elect
696          * the best one based on our affinity and topology.
697          *
698          * We prioritize the last cpu that the task executed on since
699          * it is most likely cache-hot in that location.
700          */
701         if (cpu_isset(cpu, *lowest_mask))
702                 return cpu;
703
704         /*
705          * Otherwise, we consult the sched_domains span maps to figure
706          * out which cpu is logically closest to our hot cache data.
707          */
708         if (this_cpu == cpu)
709                 this_cpu = -1; /* Skip this_cpu opt if the same */
710
711         for_each_domain(cpu, sd) {
712                 if (sd->flags & SD_WAKE_AFFINE) {
713                         cpumask_t domain_mask;
714                         int       best_cpu;
715
716                         cpus_and(domain_mask, sd->span, *lowest_mask);
717
718                         best_cpu = pick_optimal_cpu(this_cpu,
719                                                     &domain_mask);
720                         if (best_cpu != -1)
721                                 return best_cpu;
722                 }
723         }
724
725         /*
726          * And finally, if there were no matches within the domains
727          * just give the caller *something* to work with from the compatible
728          * locations.
729          */
730         return pick_optimal_cpu(this_cpu, lowest_mask);
731 }
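
find_lowest_cpus() builds a mask of the CPUs whose top RT priority is the least urgent system-wide (short-circuiting as soon as it finds a CPU with no RT work at all), and find_lowest_rq() then prefers the task's previous CPU, then a topologically close CPU, then any remaining candidate. The condensed sketch below models just the scan-then-prefer idea over plain arrays; it deliberately omits the affinity filtering and the sched_domain walk, and all names and numbers are illustrative.

/* Condensed sketch of "find the least loaded RT CPU, then prefer the
 * cache-hottest candidate", modelled on find_lowest_cpus()/find_lowest_rq(). */
#include <stdio.h>

#define NCPUS        4
#define TOY_MAX_PRIO 100        /* >= this means "no RT task queued" */

/* Step 1: collect all CPUs whose top priority value is the largest
 * (i.e. least urgent), like the mask built by find_lowest_cpus(). */
static int lowest_cpus(const int top_prio[NCPUS], int out[NCPUS])
{
        int lowest = -1, count = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++) {
                if (top_prio[cpu] > lowest) {           /* new, less urgent low */
                        lowest = top_prio[cpu];
                        count = 0;
                }
                if (top_prio[cpu] == lowest)
                        out[count++] = cpu;
        }
        return count;
}

/* Step 2: among the candidates, prefer the task's previous CPU,
 * mirroring the cache-hot preference in find_lowest_rq(). */
static int pick(const int cand[], int count, int prev_cpu)
{
        for (int i = 0; i < count; i++)
                if (cand[i] == prev_cpu)
                        return prev_cpu;
        return count ? cand[0] : -1;
}

int main(void)
{
        /* CPU0 runs prio 5, CPU1 prio 30, CPU2 and CPU3 have no RT work. */
        int top_prio[NCPUS] = { 5, 30, TOY_MAX_PRIO, TOY_MAX_PRIO };
        int cand[NCPUS];
        int n = lowest_cpus(top_prio, cand);

        printf("best CPU: %d\n", pick(cand, n, /*prev_cpu*/ 3));   /* 3 */
        return 0;
}
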
732
733 /* Will lock the rq it finds */
734 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
735 {
736         struct rq *lowest_rq = NULL;
737         int tries;
738         int cpu;
739
740         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
741                 cpu = find_lowest_rq(task);
742
743                 if ((cpu == -1) || (cpu == rq->cpu))
744                         break;
745
746                 lowest_rq = cpu_rq(cpu);
747
748                 /* if the prio of this runqueue changed, try again */
749                 if (double_lock_balance(rq, lowest_rq)) {
750                         /*
751                          * We had to unlock the run queue. In
752                          * the meantime, the task could have
753                          * migrated already or had its affinity changed.
754                          * Also make sure that it wasn't scheduled on its rq.
755                          */
756                         if (unlikely(task_rq(task) != rq ||
757                                      !cpu_isset(lowest_rq->cpu,
758                                                 task->cpus_allowed) ||
759                                      task_running(rq, task) ||
760                                      !task->se.on_rq)) {
761
762                                 spin_unlock(&lowest_rq->lock);
763                                 lowest_rq = NULL;
764                                 break;
765                         }
766                 }
767
768                 /* If this rq is still suitable use it. */
769                 if (lowest_rq->rt.highest_prio > task->prio)
770                         break;
771
772                 /* try again */
773                 spin_unlock(&lowest_rq->lock);
774                 lowest_rq = NULL;
775         }
776
777         return lowest_rq;
778 }
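
find_lock_lowest_rq() shows a recurring locking pattern: acquiring a second runqueue lock may require dropping the one we already hold, so once both locks are held every assumption made before the drop has to be revalidated, and the whole operation is retried a bounded number of times. The pthreads sketch below illustrates that pattern in generic form; the address-ordered locking, the generation counter and all names are assumptions of the example, not the kernel's double_lock_balance().

/* Generic "lock two, revalidate, retry" sketch in the spirit of
 * find_lock_lowest_rq()/double_lock_balance(). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_TRIES 3

struct toy_rq {
        pthread_mutex_t lock;
        int generation;         /* bumped whenever the rq's state changes */
};

/* Lock both runqueues, falling back to address order to avoid ABBA deadlock.
 * Returns true if 'mine' had to be dropped on the way. */
static bool double_lock(struct toy_rq *mine, struct toy_rq *other)
{
        if (pthread_mutex_trylock(&other->lock) == 0)
                return false;                   /* got it without dropping */
        pthread_mutex_unlock(&mine->lock);
        if (mine < other) {
                pthread_mutex_lock(&mine->lock);
                pthread_mutex_lock(&other->lock);
        } else {
                pthread_mutex_lock(&other->lock);
                pthread_mutex_lock(&mine->lock);
        }
        return true;                            /* caller must revalidate */
}

/* Caller holds mine->lock on entry and on return. */
static bool try_operation(struct toy_rq *mine, struct toy_rq *other)
{
        for (int tries = 0; tries < MAX_TRIES; tries++) {
                int seen = mine->generation;

                if (double_lock(mine, other) && mine->generation != seen) {
                        /* World changed while 'mine' was unlocked: retry. */
                        pthread_mutex_unlock(&other->lock);
                        continue;
                }
                /* ... do the cross-rq work here ... */
                pthread_mutex_unlock(&other->lock);
                return true;
        }
        return false;
}

int main(void)
{
        struct toy_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct toy_rq b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        pthread_mutex_lock(&a.lock);            /* caller holds its own rq lock */
        printf("operation %s\n", try_operation(&a, &b) ? "done" : "abandoned");
        pthread_mutex_unlock(&a.lock);
        return 0;
}
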
779
780 /*
781  * If the current CPU has more than one RT task, see if the non
782  * running task can migrate over to a CPU that is running a task
783  * of lesser priority.
784  */
785 static int push_rt_task(struct rq *rq)
786 {
787         struct task_struct *next_task;
788         struct rq *lowest_rq;
789         int ret = 0;
790         int paranoid = RT_MAX_TRIES;
791
792         if (!rq->rt.overloaded)
793                 return 0;
794
795         next_task = pick_next_highest_task_rt(rq, -1);
796         if (!next_task)
797                 return 0;
798
799  retry:
800         if (unlikely(next_task == rq->curr)) {
801                 WARN_ON(1);
802                 return 0;
803         }
804
805         /*
806          * It's possible that the next_task slipped in at a
807          * higher priority than current. If that's the case,
808          * just reschedule current.
809          */
810         if (unlikely(next_task->prio < rq->curr->prio)) {
811                 resched_task(rq->curr);
812                 return 0;
813         }
814
815         /* We might release rq lock */
816         get_task_struct(next_task);
817
818         /* find_lock_lowest_rq locks the rq if found */
819         lowest_rq = find_lock_lowest_rq(next_task, rq);
820         if (!lowest_rq) {
821                 struct task_struct *task;
822                 /*
823                  * find_lock_lowest_rq releases rq->lock,
824                  * so it is possible that next_task has changed.
825                  * If it has, then try again.
826                  */
827                 task = pick_next_highest_task_rt(rq, -1);
828                 if (unlikely(task != next_task) && task && paranoid--) {
829                         put_task_struct(next_task);
830                         next_task = task;
831                         goto retry;
832                 }
833                 goto out;
834         }
835
836         deactivate_task(rq, next_task, 0);
837         set_task_cpu(next_task, lowest_rq->cpu);
838         activate_task(lowest_rq, next_task, 0);
839
840         resched_task(lowest_rq->curr);
841
842         spin_unlock(&lowest_rq->lock);
843
844         ret = 1;
845 out:
846         put_task_struct(next_task);
847
848         return ret;
849 }
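
Note that push_rt_task() takes a reference on next_task with get_task_struct() before calling into code that may drop rq->lock, so the task_struct cannot be freed underneath it even if the task exits in the meantime, and every exit path drops that reference again. The toy refcount sketch below illustrates just that pin-before-unlock rule; the types and counters are invented for the example.

/* Sketch of the "pin the object before dropping the lock" rule used by
 * push_rt_task() via get_task_struct()/put_task_struct(). */
#include <stdio.h>
#include <stdlib.h>

struct toy_task {
        int refcount;
        int prio;
};

static struct toy_task *task_get(struct toy_task *t)
{
        t->refcount++;                  /* pin: object can't be freed now */
        return t;
}

static void task_put(struct toy_task *t)
{
        if (--t->refcount == 0) {
                printf("freeing task (prio %d)\n", t->prio);
                free(t);
        }
}

static void push_one(struct toy_task *next)
{
        task_get(next);
        /* ...locks may be dropped and retaken here; 'next' stays valid... */
        task_put(next);                 /* every exit path drops the pin */
}

int main(void)
{
        struct toy_task *t = malloc(sizeof(*t));

        if (!t)
                return 1;
        t->refcount = 1;                /* the "scheduler's" own reference */
        t->prio = 42;
        push_one(t);
        task_put(t);                    /* last reference: frees the task */
        return 0;
}
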
850
851 /*
852  * TODO: Currently we just use the second highest prio task on
853  *       the queue, and stop when it can't migrate (or there's
854  *       no more RT tasks).  There may be a case where a lower
855  *       priority RT task has a different affinity than the
856  *       higher RT task. In this case the lower RT task could
857  *       possibly be able to migrate whereas the higher priority
858  *       RT task could not.  We currently ignore this issue.
859  *       Enhancements are welcome!
860  */
861 static void push_rt_tasks(struct rq *rq)
862 {
863         /* push_rt_task will return true if it moved an RT */
864         while (push_rt_task(rq))
865                 ;
866 }
867
868 static int pull_rt_task(struct rq *this_rq)
869 {
870         int this_cpu = this_rq->cpu, ret = 0, cpu;
871         struct task_struct *p, *next;
872         struct rq *src_rq;
873
874         if (likely(!rt_overloaded(this_rq)))
875                 return 0;
876
877         next = pick_next_task_rt(this_rq);
878
879         for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
880                 if (this_cpu == cpu)
881                         continue;
882
883                 src_rq = cpu_rq(cpu);
884                 /*
885                  * We can potentially drop this_rq's lock in
886                  * double_lock_balance, and another CPU could
887                  * steal our next task - hence we must cause
888                  * the caller to recalculate the next task
889                  * in that case:
890                  */
891                 if (double_lock_balance(this_rq, src_rq)) {
892                         struct task_struct *old_next = next;
893
894                         next = pick_next_task_rt(this_rq);
895                         if (next != old_next)
896                                 ret = 1;
897                 }
898
899                 /*
900                  * Are there still pullable RT tasks?
901                  */
902                 if (src_rq->rt.rt_nr_running <= 1)
903                         goto skip;
904
905                 p = pick_next_highest_task_rt(src_rq, this_cpu);
906
907                 /*
908                  * Do we have an RT task that preempts
909                  * the to-be-scheduled task?
910                  */
911                 if (p && (!next || (p->prio < next->prio))) {
912                         WARN_ON(p == src_rq->curr);
913                         WARN_ON(!p->se.on_rq);
914
915                         /*
916                          * There's a chance that p is higher in priority
917                          * than what's currently running on its cpu.
918                          * This can happen when p is waking up and hasn't
919                          * had a chance to schedule yet. We only pull
920                          * p if it is lower in priority than the
921                          * current task on its run queue, and
922                          * this_rq's next task is also lower in prio
923                          * than the current task on that rq.
924                          */
925                         if (p->prio < src_rq->curr->prio ||
926                             (next && next->prio < src_rq->curr->prio))
927                                 goto skip;
928
929                         ret = 1;
930
931                         deactivate_task(src_rq, p, 0);
932                         set_task_cpu(p, this_cpu);
933                         activate_task(this_rq, p, 0);
934                         /*
935                          * We continue with the search, just in
936                          * case there's an even higher prio task
937                          * in another runqueue. (low likelihood
938                          * but possible)
939                          *
940                          * Update next so that we won't pick a task
941                          * on another cpu with a priority lower (or equal)
942                          * than the one we just picked.
943                          */
944                         next = p;
945
946                 }
947  skip:
948                 spin_unlock(&src_rq->lock);
949         }
950
951         return ret;
952 }
953
954 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
955 {
956         /* Try to pull RT tasks here if we lower this rq's prio */
957         if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
958                 pull_rt_task(rq);
959 }
960
961 static void post_schedule_rt(struct rq *rq)
962 {
963         /*
964          * If we have more than one rt_task queued, then
965          * see if we can push the other rt_tasks off to other CPUS.
966          * Note we may release the rq lock, and since
967          * the lock was owned by prev, we need to release it
968          * first via finish_lock_switch and then reacquire it here.
969          */
970         if (unlikely(rq->rt.overloaded)) {
971                 spin_lock_irq(&rq->lock);
972                 push_rt_tasks(rq);
973                 spin_unlock_irq(&rq->lock);
974         }
975 }
976
977
978 static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
979 {
980         if (!task_running(rq, p) &&
981             (p->prio >= rq->rt.highest_prio) &&
982             rq->rt.overloaded)
983                 push_rt_tasks(rq);
984 }
985
986 static unsigned long
987 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
988                 unsigned long max_load_move,
989                 struct sched_domain *sd, enum cpu_idle_type idle,
990                 int *all_pinned, int *this_best_prio)
991 {
992         /* don't touch RT tasks */
993         return 0;
994 }
995
996 static int
997 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
998                  struct sched_domain *sd, enum cpu_idle_type idle)
999 {
1000         /* don't touch RT tasks */
1001         return 0;
1002 }
1003
1004 static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
1005 {
1006         int weight = cpus_weight(*new_mask);
1007
1008         BUG_ON(!rt_task(p));
1009
1010         /*
1011          * Update the migration status of the RQ if we have an RT task
1012          * which is running AND changing its weight value.
1013          */
1014         if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1015                 struct rq *rq = task_rq(p);
1016
1017                 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1018                         rq->rt.rt_nr_migratory++;
1019                 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1020                         BUG_ON(!rq->rt.rt_nr_migratory);
1021                         rq->rt.rt_nr_migratory--;
1022                 }
1023
1024                 update_rt_migration(rq);
1025         }
1026
1027         p->cpus_allowed    = *new_mask;
1028         p->rt.nr_cpus_allowed = weight;
1029 }
1030
1031 /* Assumes rq->lock is held */
1032 static void join_domain_rt(struct rq *rq)
1033 {
1034         if (rq->rt.overloaded)
1035                 rt_set_overload(rq);
1036 }
1037
1038 /* Assumes rq->lock is held */
1039 static void leave_domain_rt(struct rq *rq)
1040 {
1041         if (rq->rt.overloaded)
1042                 rt_clear_overload(rq);
1043 }
1044
1045 /*
1046  * When switching away from the RT queue, we bring ourselves to a position
1047  * where we might want to pull RT tasks from other runqueues.
1048  */
1049 static void switched_from_rt(struct rq *rq, struct task_struct *p,
1050                            int running)
1051 {
1052         /*
1053          * If there are other RT tasks then we will reschedule
1054          * and the scheduling of the other RT tasks will handle
1055          * the balancing. But if we are the last RT task
1056          * we may need to handle the pulling of RT tasks
1057          * now.
1058          */
1059         if (!rq->rt.rt_nr_running)
1060                 pull_rt_task(rq);
1061 }
1062 #endif /* CONFIG_SMP */
1063
1064 /*
1065  * When switching a task to RT, we may overload the runqueue
1066  * with RT tasks. In this case we try to push them off to
1067  * other runqueues.
1068  */
1069 static void switched_to_rt(struct rq *rq, struct task_struct *p,
1070                            int running)
1071 {
1072         int check_resched = 1;
1073
1074         /*
1075          * If we are already running, then there's nothing
1076          * that needs to be done. But if we are not running
1077          * we may need to preempt the current running task.
1078          * If that current running task is also an RT task
1079          * then see if we can move to another run queue.
1080          */
1081         if (!running) {
1082 #ifdef CONFIG_SMP
1083                 if (rq->rt.overloaded && push_rt_task(rq) &&
1084                     /* Don't resched if we changed runqueues */
1085                     rq != task_rq(p))
1086                         check_resched = 0;
1087 #endif /* CONFIG_SMP */
1088                 if (check_resched && p->prio < rq->curr->prio)
1089                         resched_task(rq->curr);
1090         }
1091 }
1092
1093 /*
1094  * Priority of the task has changed. This may cause
1095  * us to initiate a push or pull.
1096  */
1097 static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1098                             int oldprio, int running)
1099 {
1100         if (running) {
1101 #ifdef CONFIG_SMP
1102                 /*
1103                  * If our priority decreases while running, we
1104                  * may need to pull tasks to this runqueue.
1105                  */
1106                 if (oldprio < p->prio)
1107                         pull_rt_task(rq);
1108                 /*
1109                  * If there's a higher priority task waiting to run
1110                  * then reschedule. Note, the above pull_rt_task
1111                  * can release the rq lock and p could migrate.
1112                  * Only reschedule if p is still on the same runqueue.
1113                  */
1114                 if (p->prio > rq->rt.highest_prio && rq->curr == p)
1115                         resched_task(p);
1116 #else
1117                 /* For UP simply resched on drop of prio */
1118                 if (oldprio < p->prio)
1119                         resched_task(p);
1120 #endif /* CONFIG_SMP */
1121         } else {
1122                 /*
1123                  * This task is not running, but if its priority is
1124                  * higher than that of the current running task,
1125                  * then reschedule.
1126                  */
1127                 if (p->prio < rq->curr->prio)
1128                         resched_task(rq->curr);
1129         }
1130 }
1131
1132 static void watchdog(struct rq *rq, struct task_struct *p)
1133 {
1134         unsigned long soft, hard;
1135
1136         if (!p->signal)
1137                 return;
1138
1139         soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1140         hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1141
1142         if (soft != RLIM_INFINITY) {
1143                 unsigned long next;
1144
1145                 p->rt.timeout++;
1146                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1147                 if (p->rt.timeout > next)
1148                         p->it_sched_expires = p->se.sum_exec_runtime;
1149         }
1150 }
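
watchdog() compares the task's accumulated RT ticks (p->rt.timeout) against the RLIMIT_RTTIME limit, which userspace supplies in microseconds, so the limit is first converted to ticks with DIV_ROUND_UP(limit, USEC_PER_SEC/HZ). The worked example below spells out that conversion; the HZ=250 tick rate and the 950 ms limit are example values only.

/* Worked example of the RLIMIT_RTTIME conversion done in watchdog():
 * a limit in microseconds is turned into a number of scheduler ticks. */
#include <stdio.h>

#define USEC_PER_SEC 1000000UL
#define HZ           250UL              /* example tick rate, kernels vary */

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long rttime_to_ticks(unsigned long limit_us)
{
        /* One tick is USEC_PER_SEC/HZ microseconds (4000 us at HZ=250). */
        return DIV_ROUND_UP(limit_us, USEC_PER_SEC / HZ);
}

int main(void)
{
        unsigned long soft_us = 950000;         /* e.g. 950 ms of RT runtime */
        unsigned long ticks = rttime_to_ticks(soft_us);

        printf("soft limit = %lu us -> %lu ticks\n", soft_us, ticks);
        /* 950000 / 4000 = 237.5, rounded up to 238 ticks before the
         * watchdog arms it_sched_expires for the task. */
        return 0;
}
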
1151
1152 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1153 {
1154         update_curr_rt(rq);
1155
1156         watchdog(rq, p);
1157
1158         /*
1159          * RR tasks need a special form of timeslice management.
1160          * FIFO tasks have no timeslices.
1161          */
1162         if (p->policy != SCHED_RR)
1163                 return;
1164
1165         if (--p->rt.time_slice)
1166                 return;
1167
1168         p->rt.time_slice = DEF_TIMESLICE;
1169
1170         /*
1171          * Requeue to the end of queue if we are not the only element
1172          * on the queue:
1173          */
1174         if (p->rt.run_list.prev != p->rt.run_list.next) {
1175                 requeue_task_rt(rq, p);
1176                 set_tsk_need_resched(p);
1177         }
1178 }
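
task_tick_rt() gives SCHED_RR tasks a fixed timeslice and, when it runs out, refills it and moves the task to the tail of its priority list, but only if another task is queued at the same priority; SCHED_FIFO tasks are never rotated. The compact sketch below restates that rule; the slice length, the peer-count parameter and the types are assumptions made for the example.

/* Compact sketch of the SCHED_RR timeslice rotation in task_tick_rt(). */
#include <stdbool.h>
#include <stdio.h>

#define TOY_TIMESLICE 25        /* ticks per slice; the kernel uses DEF_TIMESLICE */

struct toy_task {
        const char *name;
        bool is_rr;             /* SCHED_RR (true) vs SCHED_FIFO (false) */
        int  time_slice;
};

/* Returns true when the caller should requeue the task and resched. */
static bool rr_tick(struct toy_task *p, int peers_at_same_prio)
{
        if (!p->is_rr)
                return false;                   /* FIFO: no timeslice at all */
        if (--p->time_slice > 0)
                return false;                   /* slice not used up yet */
        p->time_slice = TOY_TIMESLICE;          /* refill for the next round */
        return peers_at_same_prio > 0;          /* rotate only if not alone */
}

int main(void)
{
        struct toy_task p = { "rr-task", true, 1 };

        printf("rotate: %d\n", rr_tick(&p, 2));  /* 1: slice expired, has peers */
        p.time_slice = 1;
        printf("rotate: %d\n", rr_tick(&p, 0));  /* 0: expired but alone */
        return 0;
}
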
1179
1180 static void set_curr_task_rt(struct rq *rq)
1181 {
1182         struct task_struct *p = rq->curr;
1183
1184         p->se.exec_start = rq->clock;
1185 }
1186
1187 const struct sched_class rt_sched_class = {
1188         .next                   = &fair_sched_class,
1189         .enqueue_task           = enqueue_task_rt,
1190         .dequeue_task           = dequeue_task_rt,
1191         .yield_task             = yield_task_rt,
1192 #ifdef CONFIG_SMP
1193         .select_task_rq         = select_task_rq_rt,
1194 #endif /* CONFIG_SMP */
1195
1196         .check_preempt_curr     = check_preempt_curr_rt,
1197
1198         .pick_next_task         = pick_next_task_rt,
1199         .put_prev_task          = put_prev_task_rt,
1200
1201 #ifdef CONFIG_SMP
1202         .load_balance           = load_balance_rt,
1203         .move_one_task          = move_one_task_rt,
1204         .set_cpus_allowed       = set_cpus_allowed_rt,
1205         .join_domain            = join_domain_rt,
1206         .leave_domain           = leave_domain_rt,
1207         .pre_schedule           = pre_schedule_rt,
1208         .post_schedule          = post_schedule_rt,
1209         .task_wake_up           = task_wake_up_rt,
1210         .switched_from          = switched_from_rt,
1211 #endif
1212
1213         .set_curr_task          = set_curr_task_rt,
1214         .task_tick              = task_tick_rt,
1215
1216         .prio_changed           = prio_changed_rt,
1217         .switched_to            = switched_to_rt,
1218 };