/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/nodemask.h>

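/*
 * Upper bound on the number of nodes a single node-level sched_domain
 * spans; systems with more online cpus than roughly this many nodes'
 * worth also get a wider "allnodes" domain above the node level.
 */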
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int find_next_best_node(int node, unsigned long *used_nodes)
{
	int i, n, val, min_val, best_node = 0;

	min_val = INT_MAX;

	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Start at @node */
		n = (node + i) % MAX_NUMNODES;

		if (!nr_cpus_node(n))
			continue;

		/* Skip already used nodes */
		if (test_bit(n, used_nodes))
			continue;

		/* Simple min distance search */
		val = node_distance(node, n);

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	set_bit(best_node, used_nodes);
	return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span: one
 * that prevents unnecessary balancing but still spreads tasks out over up
 * to SD_NODES_PER_DOMAIN of the nearest nodes.
 */
static cpumask_t sched_domain_node_span(int node)
{
	int i;
	cpumask_t span, nodemask;
	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

	cpus_clear(span);
	bitmap_zero(used_nodes, MAX_NUMNODES);

	nodemask = node_to_cpumask(node);
	cpus_or(span, span, nodemask);
	set_bit(node, used_nodes);

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next_node = find_next_best_node(node, used_nodes);
		nodemask = node_to_cpumask(next_node);
		cpus_or(span, span, nodemask);
	}

	return span;
}
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int cpu_to_cpu_group(int cpu)
{
	return cpu;
}
#endif

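/*
 * Physical (package-level) domains.  With CONFIG_SCHED_SMT enabled, all
 * hardware siblings map to the group of their first sibling, so one group
 * covers each physical package; otherwise each cpu gets its own group.
 */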
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	return first_cpu(cpu_sibling_map[cpu]);
#else
	return cpu;
#endif
}

#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() can't handle what we want to do with node
 * groups, so roll our own.  Each node gets its own dynamically allocated
 * list of groups.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int cpu_to_allnodes_group(int cpu)
{
	return cpu_to_node(cpu);
}
#endif

/*
 * Build sched domains for a given set of cpus and attach them to the
 * individual cpus.
 */
void build_sched_domains(const cpumask_t *cpu_map)
{
	int i;

	/*
	 * Set up domains for cpus specified by the cpu_map.
	 */
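	/*
	 * Each cpu gets a stack of domains, widest first: an "allnodes"
	 * domain covering the whole cpu_map (only when there are more
	 * online cpus than SD_NODES_PER_DOMAIN nodes' worth), a NUMA node
	 * domain spanning up to SD_NODES_PER_DOMAIN nearby nodes, a
	 * physical domain covering the cpu's node, and, under
	 * CONFIG_SCHED_SMT, a sibling domain.  Each level's ->parent
	 * points at the next wider one.
	 */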
	for_each_cpu_mask(i, *cpu_map) {
		int group;
		struct sched_domain *sd = NULL, *p;
		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

		cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
		if (num_online_cpus()
				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
			sd = &per_cpu(allnodes_domains, i);
			*sd = SD_ALLNODES_INIT;
			sd->span = *cpu_map;
			group = cpu_to_allnodes_group(i);
			sd->groups = &sched_group_allnodes[group];
			p = sd;
		} else
			p = NULL;

		sd = &per_cpu(node_domains, i);
		*sd = SD_NODE_INIT;
		sd->span = sched_domain_node_span(cpu_to_node(i));
		sd->parent = p;
		cpus_and(sd->span, sd->span, *cpu_map);
#endif

		p = sd;
		sd = &per_cpu(phys_domains, i);
		group = cpu_to_phys_group(i);
		*sd = SD_CPU_INIT;
		sd->span = nodemask;
		sd->parent = p;
		sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
		p = sd;
		sd = &per_cpu(cpu_domains, i);
		group = cpu_to_cpu_group(i);
		*sd = SD_SIBLING_INIT;
		sd->span = cpu_sibling_map[i];
		cpus_and(sd->span, sd->span, *cpu_map);
		sd->parent = p;
		sd->groups = &sched_group_cpus[group];
#endif
	}

#ifdef CONFIG_SCHED_SMT
	/* Set up CPU (sibling) groups */
	for_each_cpu_mask(i, *cpu_map) {
		cpumask_t this_sibling_map = cpu_sibling_map[i];
		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
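		/* Build each sibling group only once, from its first cpu */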
		if (i != first_cpu(this_sibling_map))
			continue;

		init_sched_build_groups(sched_group_cpus, this_sibling_map,
						&cpu_to_cpu_group);
	}
#endif

	/* Set up physical groups */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t nodemask = node_to_cpumask(i);

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		init_sched_build_groups(sched_group_phys, nodemask,
						&cpu_to_phys_group);
	}

#ifdef CONFIG_NUMA
	init_sched_build_groups(sched_group_allnodes, *cpu_map,
				&cpu_to_allnodes_group);

	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Set up node groups */
		struct sched_group *sg, *prev;
		cpumask_t nodemask = node_to_cpumask(i);
		cpumask_t domainspan;
		cpumask_t covered = CPU_MASK_NONE;
		int j;

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		domainspan = sched_domain_node_span(i);
		cpus_and(domainspan, domainspan, *cpu_map);

		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
		sched_group_nodes[i] = sg;
		for_each_cpu_mask(j, nodemask) {
			struct sched_domain *sd;
			sd = &per_cpu(node_domains, j);
			sd->groups = sg;
			if (sd->groups == NULL) {
				/* Turn off balancing if we have no groups */
				sd->flags = 0;
			}
		}
		if (!sg) {
			printk(KERN_WARNING
			"Can not alloc domain group for node %d\n", i);
			continue;
		}
		sg->cpu_power = 0;
		sg->cpumask = nodemask;
		cpus_or(covered, covered, nodemask);
		prev = sg;

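		/* Add a group for each other node covered by this node's span */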
		for (j = 0; j < MAX_NUMNODES; j++) {
			cpumask_t tmp, notcovered;
			int n = (i + j) % MAX_NUMNODES;

			cpus_complement(notcovered, covered);
			cpus_and(tmp, notcovered, *cpu_map);
			cpus_and(tmp, tmp, domainspan);
			if (cpus_empty(tmp))
				break;

			nodemask = node_to_cpumask(n);
			cpus_and(tmp, tmp, nodemask);
			if (cpus_empty(tmp))
				continue;

			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
			if (!sg) {
				printk(KERN_WARNING
				"Can not alloc domain group for node %d\n", j);
				break;
			}
			sg->cpu_power = 0;
			sg->cpumask = tmp;
			cpus_or(covered, covered, tmp);
			prev->next = sg;
			prev = sg;
		}
		prev->next = sched_group_nodes[i];
	}
#endif

	/* Calculate CPU power for physical packages and nodes */
	for_each_cpu_mask(i, *cpu_map) {
		int power;
		struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
		sd = &per_cpu(cpu_domains, i);
		power = SCHED_LOAD_SCALE;
		sd->groups->cpu_power = power;
#endif

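		/*
		 * A group's power grows with its size: the first cpu counts
		 * as a full SCHED_LOAD_SCALE, and each further cpu in the
		 * group adds another 10% of SCHED_LOAD_SCALE.
		 */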
		sd = &per_cpu(phys_domains, i);
		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;
		sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
		sd = &per_cpu(allnodes_domains, i);
		if (sd->groups) {
			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;
			sd->groups->cpu_power = power;
		}
#endif
	}

#ifdef CONFIG_NUMA
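	/*
	 * A node-level group's power is the sum of the powers of the
	 * physical packages it covers, each counted once.
	 */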
	for (i = 0; i < MAX_NUMNODES; i++) {
		struct sched_group *sg = sched_group_nodes[i];
		int j;

		if (sg == NULL)
			continue;
next_sg:
		for_each_cpu_mask(j, sg->cpumask) {
			struct sched_domain *sd;
			int power;

			sd = &per_cpu(phys_domains, j);
			if (j != first_cpu(sd->groups->cpumask)) {
				/*
				 * Only add "power" once for each
				 * physical package.
				 */
				continue;
			}
			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;

			sg->cpu_power += power;
		}
		sg = sg->next;
		if (sg != sched_group_nodes[i])
			goto next_sg;
	}
#endif

	/* Attach the domains */
	for_each_online_cpu(i) {
		struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
		sd = &per_cpu(cpu_domains, i);
#else
		sd = &per_cpu(phys_domains, i);
#endif
		cpu_attach_domain(sd, i);
	}
}

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void arch_init_sched_domains(const cpumask_t *cpu_map)
{
	cpumask_t cpu_default_map;

	/*
	 * Set up the mask of cpus without special-case scheduling
	 * requirements.  For now this just excludes isolated cpus, but it
	 * could be used to exclude other special cases in the future.
	 */
	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);

	build_sched_domains(&cpu_default_map);
}

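/*
 * Tear down the dynamically allocated NUMA node group lists built by
 * build_sched_domains().  Each per-node list is circular, so start at
 * sg->next and free every entry, finishing with the list head.
 */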
void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
#ifdef CONFIG_NUMA
	int i;
	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t nodemask = node_to_cpumask(i);
		struct sched_group *oldsg, *sg = sched_group_nodes[i];

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		if (sg == NULL)
			continue;
		sg = sg->next;
next_sg:
		oldsg = sg;
		sg = sg->next;
		kfree(oldsg);
		if (oldsg != sched_group_nodes[i])
			goto next_sg;
		sched_group_nodes[i] = NULL;
	}
#endif
}