arch/x86/kernel/cpu/intel_rdt_rdtgroup.c

   1 /*
   2  * User interface for Resource Allocation in Resource Director Technology (RDT)
   3  *
   4  * Copyright (C) 2016 Intel Corporation
   5  *
   6  * Author: Fenghua Yu <fenghua.yu@intel.com>
   7  *
   8  * This program is free software; you can redistribute it and/or modify it
   9  * under the terms and conditions of the GNU General Public License,
  10  * version 2, as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15  * more details.
  16  *
  17  * More information about RDT can be found in the Intel (R) x86 Architecture
  18  * Software Developer Manual.
  19  */
  20
  21 #define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  22
  23 #include <linux/cacheinfo.h>
  24 #include <linux/cpu.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/fs.h>
  27 #include <linux/sysfs.h>
  28 #include <linux/kernfs.h>
  29 #include <linux/seq_buf.h>
  30 #include <linux/seq_file.h>
  31 #include <linux/sched/signal.h>
  32 #include <linux/sched/task.h>
  33 #include <linux/slab.h>
  34 #include <linux/task_work.h>
  35
  36 #include <uapi/linux/magic.h>
  37
  38 #include <asm/intel_rdt_sched.h>
  39 #include "intel_rdt.h"
  40
/*
 * Static keys, all initially false. NOTE(review): presumably flipped
 * elsewhere when RDT, monitoring or allocation gets enabled - confirm.
 */
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
/* kernfs root of the hierarchy; not assigned within this part of the file */
static struct kernfs_root *rdt_root;
/* The default resource group; CLOSID 0 is reserved for it (see closid_init()) */
struct rdtgroup rdtgroup_default;
/* List of resource groups, linked through rdtgroup::rdtgroup_list */
LIST_HEAD(rdt_all_groups);

/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;

/* Kernel fs node for "mon_groups" directory under root */
static struct kernfs_node *kn_mongrp;

/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;

/*
 * Status text of the last command, filled through the rdt_last_cmd_*()
 * helpers and reported by rdt_last_cmd_status_show().
 * NOTE(review): last_cmd_status_buf is presumably the backing storage of
 * last_cmd_status; the initialization is not in this part of the file.
 */
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];

/* debugfs dentry; NOTE(review): presumably the resctrl debugfs directory - confirm */
struct dentry *debugfs_resctrl;
  61
/*
 * Clear the last-command status buffer.
 * The caller must hold rdtgroup_mutex (asserted via lockdep).
 */
void rdt_last_cmd_clear(void)
{
	lockdep_assert_held(&rdtgroup_mutex);
	seq_buf_clear(&last_cmd_status);
}
  67
/*
 * Append the string @s to the last-command status buffer.
 * The caller must hold rdtgroup_mutex (asserted via lockdep).
 */
void rdt_last_cmd_puts(const char *s)
{
	lockdep_assert_held(&rdtgroup_mutex);
	seq_buf_puts(&last_cmd_status, s);
}
  73
  74 void rdt_last_cmd_printf(const char *fmt, ...)
  75 {
  76         va_list ap;
  77
  78         va_start(ap, fmt);
  79         lockdep_assert_held(&rdtgroup_mutex);
  80         seq_buf_vprintf(&last_cmd_status, fmt, ap);
  81         va_end(ap);
  82 }
  83
/*
 * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
 * we can keep a bitmap of free CLOSIDs in a single integer.
 *
 * Using a global CLOSID across all resources has some advantages and
 * some drawbacks:
 * + We can simply set "current->closid" to assign a task to a resource
 *   group.
 * + Context switch code can avoid extra memory references deciding which
 *   CLOSID to load into the PQR_ASSOC MSR
 * - We give up some options in configuring resource groups across multi-socket
 *   systems.
 * - Our choices on how to configure each resource become progressively more
 *   limited as the number of resources grows.
 */
/* Bit n set: CLOSID n is free. Bit n clear: CLOSID n is in use or reserved. */
static int closid_free_map;
/* Number of CLOSIDs managed by the allocator, computed in closid_init() */
static int closid_free_map_len;
 101
/*
 * Return the number of CLOSIDs managed by the allocator, i.e. the value
 * closid_init() stored in closid_free_map_len.
 */
int closids_supported(void)
{
	return closid_free_map_len;
}
 106
 107 static void closid_init(void)
 108 {
 109         struct rdt_resource *r;
 110         int rdt_min_closid = 32;
 111
 112         /* Compute rdt_min_closid across all resources */
 113         for_each_alloc_enabled_rdt_resource(r)
 114                 rdt_min_closid = min(rdt_min_closid, r->num_closid);
 115
 116         closid_free_map = BIT_MASK(rdt_min_closid) - 1;
 117
 118         /* CLOSID 0 is always reserved for the default group */
 119         closid_free_map &= ~1;
 120         closid_free_map_len = rdt_min_closid;
 121 }
 122
 123 static int closid_alloc(void)
 124 {
 125         u32 closid = ffs(closid_free_map);
 126
 127         if (closid == 0)
 128                 return -ENOSPC;
 129         closid--;
 130         closid_free_map &= ~(1 << closid);
 131
 132         return closid;
 133 }
 134
/* Return @closid to the allocator by setting its bit in the free map. */
void closid_free(int closid)
{
	closid_free_map |= 1 << closid;
}
 139
 140 /**
 141  * closid_allocated - test if provided closid is in use
 142  * @closid: closid to be tested
 143  *
 144  * Return: true if @closid is currently associated with a resource group,
 145  * false if @closid is free
 146  */
 147 static bool closid_allocated(unsigned int closid)
 148 {
 149         return (closid_free_map & (1 << closid)) == 0;
 150 }
 151
 152 /**
 153  * rdtgroup_mode_by_closid - Return mode of resource group with closid
 154  * @closid: closid if the resource group
 155  *
 156  * Each resource group is associated with a @closid. Here the mode
 157  * of a resource group can be queried by searching for it using its closid.
 158  *
 159  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 160  */
 161 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 162 {
 163         struct rdtgroup *rdtgrp;
 164
 165         list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 166                 if (rdtgrp->closid == closid)
 167                         return rdtgrp->mode;
 168         }
 169
 170         return RDT_NUM_MODES;
 171 }
 172
/* Printable names of the resource group modes, indexed by the mode value */
static const char * const rdt_mode_str[] = {
	[RDT_MODE_SHAREABLE]		= "shareable",
	[RDT_MODE_EXCLUSIVE]		= "exclusive",
	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
};
 179
 180 /**
 181  * rdtgroup_mode_str - Return the string representation of mode
 182  * @mode: the resource group mode as &enum rdtgroup_mode
 183  *
 184  * Return: string representation of valid mode, "unknown" otherwise
 185  */
 186 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 187 {
 188         if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 189                 return "unknown";
 190
 191         return rdt_mode_str[mode];
 192 }
 193
 194 /* set uid and gid of rdtgroup dirs and files to that of the creator */
 195 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 196 {
 197         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 198                                 .ia_uid = current_fsuid(),
 199                                 .ia_gid = current_fsgid(), };
 200
 201         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 202             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 203                 return 0;
 204
 205         return kernfs_setattr(kn, &iattr);
 206 }
 207
 208 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 209 {
 210         struct kernfs_node *kn;
 211         int ret;
 212
 213         kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 214                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 215                                   0, rft->kf_ops, rft, NULL, NULL);
 216         if (IS_ERR(kn))
 217                 return PTR_ERR(kn);
 218
 219         ret = rdtgroup_kn_set_ugid(kn);
 220         if (ret) {
 221                 kernfs_remove(kn);
 222                 return ret;
 223         }
 224
 225         return 0;
 226 }
 227
 228 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 229 {
 230         struct kernfs_open_file *of = m->private;
 231         struct rftype *rft = of->kn->priv;
 232
 233         if (rft->seq_show)
 234                 return rft->seq_show(of, m, arg);
 235         return 0;
 236 }
 237
 238 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 239                                    size_t nbytes, loff_t off)
 240 {
 241         struct rftype *rft = of->kn->priv;
 242
 243         if (rft->write)
 244                 return rft->write(of, buf, nbytes, off);
 245
 246         return -EINVAL;
 247 }
 248
/*
 * kernfs operations that dispatch reads and writes to the per-file rftype
 * handlers via rdtgroup_seqfile_show() and rdtgroup_file_write().
 */
static struct kernfs_ops rdtgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= rdtgroup_file_write,
	.seq_show		= rdtgroup_seqfile_show,
};
 254
/*
 * kernfs operations whose content is produced by rdtgroup_mondata_show()
 * (defined outside this part of the file); no write handler is set.
 */
static struct kernfs_ops kf_mondata_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.seq_show		= rdtgroup_mondata_show,
};
 259
 260 static bool is_cpu_list(struct kernfs_open_file *of)
 261 {
 262         struct rftype *rft = of->kn->priv;
 263
 264         return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 265 }
 266
 267 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 268                               struct seq_file *s, void *v)
 269 {
 270         struct rdtgroup *rdtgrp;
 271         int ret = 0;
 272
 273         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 274
 275         if (rdtgrp) {
 276                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
 277                         seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 278                                    cpumask_pr_args(&rdtgrp->plr->d->cpu_mask));
 279                 else
 280                         seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 281                                    cpumask_pr_args(&rdtgrp->cpu_mask));
 282         } else {
 283                 ret = -ENOENT;
 284         }
 285         rdtgroup_kn_unlock(of->kn);
 286
 287         return ret;
 288 }
 289
 290 /*
 291  * This is safe against intel_rdt_sched_in() called from __switch_to()
 292  * because __switch_to() is executed with interrupts disabled. A local call
 293  * from update_closid_rmid() is proteced against __switch_to() because
 294  * preemption is disabled.
 295  */
 296 static void update_cpu_closid_rmid(void *info)
 297 {
 298         struct rdtgroup *r = info;
 299
 300         if (r) {
 301                 this_cpu_write(pqr_state.default_closid, r->closid);
 302                 this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
 303         }
 304
 305         /*
 306          * We cannot unconditionally write the MSR because the current
 307          * executing task might have its own closid selected. Just reuse
 308          * the context switch code.
 309          */
 310         intel_rdt_sched_in();
 311 }
 312
 313 /*
 314  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
 315  *
 316  * Per task closids/rmids must have been set up before calling this function.
 317  */
 318 static void
 319 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 320 {
 321         int cpu = get_cpu();
 322
 323         if (cpumask_test_cpu(cpu, cpu_mask))
 324                 update_cpu_closid_rmid(r);
 325         smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 326         put_cpu();
 327 }
 328
 329 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 330                           cpumask_var_t tmpmask)
 331 {
 332         struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 333         struct list_head *head;
 334
 335         /* Check whether cpus belong to parent ctrl group */
 336         cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 337         if (cpumask_weight(tmpmask)) {
 338                 rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
 339                 return -EINVAL;
 340         }
 341
 342         /* Check whether cpus are dropped from this group */
 343         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 344         if (cpumask_weight(tmpmask)) {
 345                 /* Give any dropped cpus to parent rdtgroup */
 346                 cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 347                 update_closid_rmid(tmpmask, prgrp);
 348         }
 349
 350         /*
 351          * If we added cpus, remove them from previous group that owned them
 352          * and update per-cpu rmid
 353          */
 354         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 355         if (cpumask_weight(tmpmask)) {
 356                 head = &prgrp->mon.crdtgrp_list;
 357                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 358                         if (crgrp == rdtgrp)
 359                                 continue;
 360                         cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 361                                        tmpmask);
 362                 }
 363                 update_closid_rmid(tmpmask, rdtgrp);
 364         }
 365
 366         /* Done pushing/pulling - update this group with new mask */
 367         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 368
 369         return 0;
 370 }
 371
 372 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 373 {
 374         struct rdtgroup *crgrp;
 375
 376         cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
 377         /* update the child mon group masks as well*/
 378         list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 379                 cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 380 }
 381
 382 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 383                            cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 384 {
 385         struct rdtgroup *r, *crgrp;
 386         struct list_head *head;
 387
 388         /* Check whether cpus are dropped from this group */
 389         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 390         if (cpumask_weight(tmpmask)) {
 391                 /* Can't drop from default group */
 392                 if (rdtgrp == &rdtgroup_default) {
 393                         rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 394                         return -EINVAL;
 395                 }
 396
 397                 /* Give any dropped cpus to rdtgroup_default */
 398                 cpumask_or(&rdtgroup_default.cpu_mask,
 399                            &rdtgroup_default.cpu_mask, tmpmask);
 400                 update_closid_rmid(tmpmask, &rdtgroup_default);
 401         }
 402
 403         /*
 404          * If we added cpus, remove them from previous group and
 405          * the prev group's child groups that owned them
 406          * and update per-cpu closid/rmid.
 407          */
 408         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 409         if (cpumask_weight(tmpmask)) {
 410                 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 411                         if (r == rdtgrp)
 412                                 continue;
 413                         cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 414                         if (cpumask_weight(tmpmask1))
 415                                 cpumask_rdtgrp_clear(r, tmpmask1);
 416                 }
 417                 update_closid_rmid(tmpmask, rdtgrp);
 418         }
 419
 420         /* Done pushing/pulling - update this group with new mask */
 421         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 422
 423         /*
 424          * Clear child mon group masks since there is a new parent mask
 425          * now and update the rmid for the cpus the child lost.
 426          */
 427         head = &rdtgrp->mon.crdtgrp_list;
 428         list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 429                 cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 430                 update_closid_rmid(tmpmask, rdtgrp);
 431                 cpumask_clear(&crgrp->cpu_mask);
 432         }
 433
 434         return 0;
 435 }
 436
 437 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 438                                    char *buf, size_t nbytes, loff_t off)
 439 {
 440         cpumask_var_t tmpmask, newmask, tmpmask1;
 441         struct rdtgroup *rdtgrp;
 442         int ret;
 443
 444         if (!buf)
 445                 return -EINVAL;
 446
 447         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 448                 return -ENOMEM;
 449         if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 450                 free_cpumask_var(tmpmask);
 451                 return -ENOMEM;
 452         }
 453         if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 454                 free_cpumask_var(tmpmask);
 455                 free_cpumask_var(newmask);
 456                 return -ENOMEM;
 457         }
 458
 459         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 460         rdt_last_cmd_clear();
 461         if (!rdtgrp) {
 462                 ret = -ENOENT;
 463                 rdt_last_cmd_puts("directory was removed\n");
 464                 goto unlock;
 465         }
 466
 467         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 468             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 469                 ret = -EINVAL;
 470                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 471                 goto unlock;
 472         }
 473
 474         if (is_cpu_list(of))
 475                 ret = cpulist_parse(buf, newmask);
 476         else
 477                 ret = cpumask_parse(buf, newmask);
 478
 479         if (ret) {
 480                 rdt_last_cmd_puts("bad cpu list/mask\n");
 481                 goto unlock;
 482         }
 483
 484         /* check that user didn't specify any offline cpus */
 485         cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 486         if (cpumask_weight(tmpmask)) {
 487                 ret = -EINVAL;
 488                 rdt_last_cmd_puts("can only assign online cpus\n");
 489                 goto unlock;
 490         }
 491
 492         if (rdtgrp->type == RDTCTRL_GROUP)
 493                 ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 494         else if (rdtgrp->type == RDTMON_GROUP)
 495                 ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 496         else
 497                 ret = -EINVAL;
 498
 499 unlock:
 500         rdtgroup_kn_unlock(of->kn);
 501         free_cpumask_var(tmpmask);
 502         free_cpumask_var(newmask);
 503         free_cpumask_var(tmpmask1);
 504
 505         return ret ?: nbytes;
 506 }
 507
/*
 * Task work item queued by __rdtgroup_move_task() and run by move_myself().
 * @work:   callback head handed to task_work_add()
 * @rdtgrp: resource group the task is moved to; its waitcount is held
 *          until the callback has run
 */
struct task_move_callback {
	struct callback_head	work;
	struct rdtgroup		*rdtgrp;
};
 512
 513 static void move_myself(struct callback_head *head)
 514 {
 515         struct task_move_callback *callback;
 516         struct rdtgroup *rdtgrp;
 517
 518         callback = container_of(head, struct task_move_callback, work);
 519         rdtgrp = callback->rdtgrp;
 520
 521         /*
 522          * If resource group was deleted before this task work callback
 523          * was invoked, then assign the task to root group and free the
 524          * resource group.
 525          */
 526         if (atomic_dec_and_test(&rdtgrp->waitcount) &&
 527             (rdtgrp->flags & RDT_DELETED)) {
 528                 current->closid = 0;
 529                 current->rmid = 0;
 530                 kfree(rdtgrp);
 531         }
 532
 533         preempt_disable();
 534         /* update PQR_ASSOC MSR to make resource group go into effect */
 535         intel_rdt_sched_in();
 536         preempt_enable();
 537
 538         kfree(callback);
 539 }
 540
 541 static int __rdtgroup_move_task(struct task_struct *tsk,
 542                                 struct rdtgroup *rdtgrp)
 543 {
 544         struct task_move_callback *callback;
 545         int ret;
 546
 547         callback = kzalloc(sizeof(*callback), GFP_KERNEL);
 548         if (!callback)
 549                 return -ENOMEM;
 550         callback->work.func = move_myself;
 551         callback->rdtgrp = rdtgrp;
 552
 553         /*
 554          * Take a refcount, so rdtgrp cannot be freed before the
 555          * callback has been invoked.
 556          */
 557         atomic_inc(&rdtgrp->waitcount);
 558         ret = task_work_add(tsk, &callback->work, true);
 559         if (ret) {
 560                 /*
 561                  * Task is exiting. Drop the refcount and free the callback.
 562                  * No need to check the refcount as the group cannot be
 563                  * deleted before the write function unlocks rdtgroup_mutex.
 564                  */
 565                 atomic_dec(&rdtgrp->waitcount);
 566                 kfree(callback);
 567                 rdt_last_cmd_puts("task exited\n");
 568         } else {
 569                 /*
 570                  * For ctrl_mon groups move both closid and rmid.
 571                  * For monitor groups, can move the tasks only from
 572                  * their parent CTRL group.
 573                  */
 574                 if (rdtgrp->type == RDTCTRL_GROUP) {
 575                         tsk->closid = rdtgrp->closid;
 576                         tsk->rmid = rdtgrp->mon.rmid;
 577                 } else if (rdtgrp->type == RDTMON_GROUP) {
 578                         if (rdtgrp->mon.parent->closid == tsk->closid) {
 579                                 tsk->rmid = rdtgrp->mon.rmid;
 580                         } else {
 581                                 rdt_last_cmd_puts("Can't move task to different control group\n");
 582                                 ret = -EINVAL;
 583                         }
 584                 }
 585         }
 586         return ret;
 587 }
 588
 589 /**
 590  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 591  * @r: Resource group
 592  *
 593  * Return: 1 if tasks have been assigned to @r, 0 otherwise
 594  */
 595 int rdtgroup_tasks_assigned(struct rdtgroup *r)
 596 {
 597         struct task_struct *p, *t;
 598         int ret = 0;
 599
 600         lockdep_assert_held(&rdtgroup_mutex);
 601
 602         rcu_read_lock();
 603         for_each_process_thread(p, t) {
 604                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 605                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
 606                         ret = 1;
 607                         break;
 608                 }
 609         }
 610         rcu_read_unlock();
 611
 612         return ret;
 613 }
 614
 615 static int rdtgroup_task_write_permission(struct task_struct *task,
 616                                           struct kernfs_open_file *of)
 617 {
 618         const struct cred *tcred = get_task_cred(task);
 619         const struct cred *cred = current_cred();
 620         int ret = 0;
 621
 622         /*
 623          * Even if we're attaching all tasks in the thread group, we only
 624          * need to check permissions on one of them.
 625          */
 626         if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 627             !uid_eq(cred->euid, tcred->uid) &&
 628             !uid_eq(cred->euid, tcred->suid)) {
 629                 rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 630                 ret = -EPERM;
 631         }
 632
 633         put_cred(tcred);
 634         return ret;
 635 }
 636
 637 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 638                               struct kernfs_open_file *of)
 639 {
 640         struct task_struct *tsk;
 641         int ret;
 642
 643         rcu_read_lock();
 644         if (pid) {
 645                 tsk = find_task_by_vpid(pid);
 646                 if (!tsk) {
 647                         rcu_read_unlock();
 648                         rdt_last_cmd_printf("No task %d\n", pid);
 649                         return -ESRCH;
 650                 }
 651         } else {
 652                 tsk = current;
 653         }
 654
 655         get_task_struct(tsk);
 656         rcu_read_unlock();
 657
 658         ret = rdtgroup_task_write_permission(tsk, of);
 659         if (!ret)
 660                 ret = __rdtgroup_move_task(tsk, rdtgrp);
 661
 662         put_task_struct(tsk);
 663         return ret;
 664 }
 665
 666 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 667                                     char *buf, size_t nbytes, loff_t off)
 668 {
 669         struct rdtgroup *rdtgrp;
 670         int ret = 0;
 671         pid_t pid;
 672
 673         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 674                 return -EINVAL;
 675         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 676         if (!rdtgrp) {
 677                 rdtgroup_kn_unlock(of->kn);
 678                 return -ENOENT;
 679         }
 680         rdt_last_cmd_clear();
 681
 682         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 683             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 684                 ret = -EINVAL;
 685                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 686                 goto unlock;
 687         }
 688
 689         ret = rdtgroup_move_task(pid, rdtgrp, of);
 690
 691 unlock:
 692         rdtgroup_kn_unlock(of->kn);
 693
 694         return ret ?: nbytes;
 695 }
 696
 697 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 698 {
 699         struct task_struct *p, *t;
 700
 701         rcu_read_lock();
 702         for_each_process_thread(p, t) {
 703                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 704                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
 705                         seq_printf(s, "%d\n", t->pid);
 706         }
 707         rcu_read_unlock();
 708 }
 709
 710 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 711                                struct seq_file *s, void *v)
 712 {
 713         struct rdtgroup *rdtgrp;
 714         int ret = 0;
 715
 716         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 717         if (rdtgrp)
 718                 show_rdt_tasks(rdtgrp, s);
 719         else
 720                 ret = -ENOENT;
 721         rdtgroup_kn_unlock(of->kn);
 722
 723         return ret;
 724 }
 725
 726 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 727                                     struct seq_file *seq, void *v)
 728 {
 729         int len;
 730
 731         mutex_lock(&rdtgroup_mutex);
 732         len = seq_buf_used(&last_cmd_status);
 733         if (len)
 734                 seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 735         else
 736                 seq_puts(seq, "ok\n");
 737         mutex_unlock(&rdtgroup_mutex);
 738         return 0;
 739 }
 740
 741 static int rdt_num_closids_show(struct kernfs_open_file *of,
 742                                 struct seq_file *seq, void *v)
 743 {
 744         struct rdt_resource *r = of->kn->parent->priv;
 745
 746         seq_printf(seq, "%d\n", r->num_closid);
 747         return 0;
 748 }
 749
 750 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
 751                              struct seq_file *seq, void *v)
 752 {
 753         struct rdt_resource *r = of->kn->parent->priv;
 754
 755         seq_printf(seq, "%x\n", r->default_ctrl);
 756         return 0;
 757 }
 758
 759 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 760                              struct seq_file *seq, void *v)
 761 {
 762         struct rdt_resource *r = of->kn->parent->priv;
 763
 764         seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
 765         return 0;
 766 }
 767
 768 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 769                                    struct seq_file *seq, void *v)
 770 {
 771         struct rdt_resource *r = of->kn->parent->priv;
 772
 773         seq_printf(seq, "%x\n", r->cache.shareable_bits);
 774         return 0;
 775 }
 776
 777 /**
 778  * rdt_bit_usage_show - Display current usage of resources
 779  *
 780  * A domain is a shared resource that can now be allocated differently. Here
 781  * we display the current regions of the domain as an annotated bitmask.
 782  * For each domain of this resource its allocation bitmask
 783  * is annotated as below to indicate the current usage of the corresponding bit:
 784  *   0 - currently unused
 785  *   X - currently available for sharing and used by software and hardware
 786  *   H - currently used by hardware only but available for software use
 787  *   S - currently used and shareable by software only
 788  *   E - currently used exclusively by one resource group
 789  *   P - currently pseudo-locked by one resource group
 790  */
 791 static int rdt_bit_usage_show(struct kernfs_open_file *of,
 792                               struct seq_file *seq, void *v)
 793 {
 794         struct rdt_resource *r = of->kn->parent->priv;
 795         u32 sw_shareable = 0, hw_shareable = 0;
 796         u32 exclusive = 0, pseudo_locked = 0;
 797         struct rdt_domain *dom;
 798         int i, hwb, swb, excl, psl;
 799         enum rdtgrp_mode mode;
 800         bool sep = false;
 801         u32 *ctrl;
 802
 803         mutex_lock(&rdtgroup_mutex);
 804         hw_shareable = r->cache.shareable_bits;
 805         list_for_each_entry(dom, &r->domains, list) {
 806                 if (sep)
 807                         seq_putc(seq, ';');
 808                 ctrl = dom->ctrl_val;
 809                 sw_shareable = 0;
 810                 exclusive = 0;
 811                 seq_printf(seq, "%d=", dom->id);
 812                 for (i = 0; i < closids_supported(); i++, ctrl++) {
 813                         if (!closid_allocated(i))
 814                                 continue;
 815                         mode = rdtgroup_mode_by_closid(i);
 816                         switch (mode) {
 817                         case RDT_MODE_SHAREABLE:
 818                                 sw_shareable |= *ctrl;
 819                                 break;
 820                         case RDT_MODE_EXCLUSIVE:
 821                                 exclusive |= *ctrl;
 822                                 break;
 823                         case RDT_MODE_PSEUDO_LOCKSETUP:
 824                         /*
 825                          * RDT_MODE_PSEUDO_LOCKSETUP is possible
 826                          * here but not included since the CBM
 827                          * associated with this CLOSID in this mode
 828                          * is not initialized and no task or cpu can be
 829                          * assigned this CLOSID.
 830                          */
 831                                 break;
 832                         case RDT_MODE_PSEUDO_LOCKED:
 833                         case RDT_NUM_MODES:
 834                                 WARN(1,
 835                                      "invalid mode for closid %d\n", i);
 836                                 break;
 837                         }
 838                 }
 839                 for (i = r->cache.cbm_len - 1; i >= 0; i--) {
 840                         pseudo_locked = dom->plr ? dom->plr->cbm : 0;
 841                         hwb = test_bit(i, (unsigned long *)&hw_shareable);
 842                         swb = test_bit(i, (unsigned long *)&sw_shareable);
 843                         excl = test_bit(i, (unsigned long *)&exclusive);
 844                         psl = test_bit(i, (unsigned long *)&pseudo_locked);
 845                         if (hwb && swb)
 846                                 seq_putc(seq, 'X');
 847                         else if (hwb && !swb)
 848                                 seq_putc(seq, 'H');
 849                         else if (!hwb && swb)
 850                                 seq_putc(seq, 'S');
 851                         else if (excl)
 852                                 seq_putc(seq, 'E');
 853                         else if (psl)
 854                                 seq_putc(seq, 'P');
 855                         else /* Unused bits remain */
 856                                 seq_putc(seq, '0');
 857                 }
 858                 sep = true;
 859         }
 860         seq_putc(seq, '\n');
 861         mutex_unlock(&rdtgroup_mutex);
 862         return 0;
 863 }
 864
 865 static int rdt_min_bw_show(struct kernfs_open_file *of,
 866                              struct seq_file *seq, void *v)
 867 {
 868         struct rdt_resource *r = of->kn->parent->priv;
 869
 870         seq_printf(seq, "%u\n", r->membw.min_bw);
 871         return 0;
 872 }
 873
 874 static int rdt_num_rmids_show(struct kernfs_open_file *of,
 875                               struct seq_file *seq, void *v)
 876 {
 877         struct rdt_resource *r = of->kn->parent->priv;
 878
 879         seq_printf(seq, "%d\n", r->num_rmid);
 880
 881         return 0;
 882 }
 883
 884 static int rdt_mon_features_show(struct kernfs_open_file *of,
 885                                  struct seq_file *seq, void *v)
 886 {
 887         struct rdt_resource *r = of->kn->parent->priv;
 888         struct mon_evt *mevt;
 889
 890         list_for_each_entry(mevt, &r->evt_list, list)
 891                 seq_printf(seq, "%s\n", mevt->name);
 892
 893         return 0;
 894 }
 895
 896 static int rdt_bw_gran_show(struct kernfs_open_file *of,
 897                              struct seq_file *seq, void *v)
 898 {
 899         struct rdt_resource *r = of->kn->parent->priv;
 900
 901         seq_printf(seq, "%u\n", r->membw.bw_gran);
 902         return 0;
 903 }
 904
 905 static int rdt_delay_linear_show(struct kernfs_open_file *of,
 906                              struct seq_file *seq, void *v)
 907 {
 908         struct rdt_resource *r = of->kn->parent->priv;
 909
 910         seq_printf(seq, "%u\n", r->membw.delay_linear);
 911         return 0;
 912 }
 913
 914 static int max_threshold_occ_show(struct kernfs_open_file *of,
 915                                   struct seq_file *seq, void *v)
 916 {
 917         struct rdt_resource *r = of->kn->parent->priv;
 918
 919         seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
 920
 921         return 0;
 922 }
 923
 924 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 925                                        char *buf, size_t nbytes, loff_t off)
 926 {
 927         struct rdt_resource *r = of->kn->parent->priv;
 928         unsigned int bytes;
 929         int ret;
 930
 931         ret = kstrtouint(buf, 0, &bytes);
 932         if (ret)
 933                 return ret;
 934
 935         if (bytes > (boot_cpu_data.x86_cache_size * 1024))
 936                 return -EINVAL;
 937
 938         intel_cqm_threshold = bytes / r->mon_scale;
 939
 940         return nbytes;
 941 }
 942
 943 /*
 944  * rdtgroup_mode_show - Display mode of this resource group
 945  */
 946 static int rdtgroup_mode_show(struct kernfs_open_file *of,
 947                               struct seq_file *s, void *v)
 948 {
 949         struct rdtgroup *rdtgrp;
 950
 951         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 952         if (!rdtgrp) {
 953                 rdtgroup_kn_unlock(of->kn);
 954                 return -ENOENT;
 955         }
 956
 957         seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
 958
 959         rdtgroup_kn_unlock(of->kn);
 960         return 0;
 961 }
 962
 963 /**
 964  * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
 965  * @r: Resource to which domain instance @d belongs.
 966  * @d: The domain instance for which @closid is being tested.
 967  * @cbm: Capacity bitmask being tested.
 968  * @closid: Intended closid for @cbm.
 969  * @exclusive: Only check if overlaps with exclusive resource groups
 970  *
 971  * Checks if provided @cbm intended to be used for @closid on domain
 972  * @d overlaps with any other closids or other hardware usage associated
 973  * with this domain. If @exclusive is true then only overlaps with
 974  * resource groups in exclusive mode will be considered. If @exclusive
 975  * is false then overlaps with any resource group or hardware entities
 976  * will be considered.
 977  *
 978  * @cbm is unsigned long, even if only 32 bits are used, to make the
 979  * bitmap functions work correctly.
 980  *
 981  * Return: false if CBM does not overlap, true if it does.
 982  */
 983 bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
 984                            unsigned long cbm, int closid, bool exclusive)
 985 {
 986         enum rdtgrp_mode mode;
 987         unsigned long ctrl_b;
 988         u32 *ctrl;
 989         int i;
 990
 991         /* Check for any overlap with regions used by hardware directly */
 992         if (!exclusive) {
 993                 ctrl_b = r->cache.shareable_bits;
 994                 if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
 995                         return true;
 996         }
 997
 998         /* Check for overlap with other resource groups */
 999         ctrl = d->ctrl_val;
1000         for (i = 0; i < closids_supported(); i++, ctrl++) {
1001                 ctrl_b = *ctrl;
1002                 mode = rdtgroup_mode_by_closid(i);
1003                 if (closid_allocated(i) && i != closid &&
1004                     mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1005                         if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1006                                 if (exclusive) {
1007                                         if (mode == RDT_MODE_EXCLUSIVE)
1008                                                 return true;
1009                                         continue;
1010                                 }
1011                                 return true;
1012                         }
1013                 }
1014         }
1015
1016         return false;
1017 }
1018
1019 /**
1020  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1021  *
1022  * An exclusive resource group implies that there should be no sharing of
1023  * its allocated resources. At the time this group is considered to be
1024  * exclusive this test can determine if its current schemata supports this
1025  * setting by testing for overlap with all other resource groups.
1026  *
1027  * Return: true if resource group can be exclusive, false if there is overlap
1028  * with allocations of other resource groups and thus this resource group
1029  * cannot be exclusive.
1030  */
1031 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1032 {
1033         int closid = rdtgrp->closid;
1034         struct rdt_resource *r;
1035         bool has_cache = false;
1036         struct rdt_domain *d;
1037
1038         for_each_alloc_enabled_rdt_resource(r) {
1039                 if (r->rid == RDT_RESOURCE_MBA)
1040                         continue;
1041                 has_cache = true;
1042                 list_for_each_entry(d, &r->domains, list) {
1043                         if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
1044                                                   rdtgrp->closid, false)) {
1045                                 rdt_last_cmd_puts("schemata overlaps\n");
1046                                 return false;
1047                         }
1048                 }
1049         }
1050
1051         if (!has_cache) {
1052                 rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
1053                 return false;
1054         }
1055
1056         return true;
1057 }
1058
1059 /**
1060  * rdtgroup_mode_write - Modify the resource group's mode
1061  *
1062  */
1063 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1064                                    char *buf, size_t nbytes, loff_t off)
1065 {
1066         struct rdtgroup *rdtgrp;
1067         enum rdtgrp_mode mode;
1068         int ret = 0;
1069
1070         /* Valid input requires a trailing newline */
1071         if (nbytes == 0 || buf[nbytes - 1] != '\n')
1072                 return -EINVAL;
1073         buf[nbytes - 1] = '\0';
1074
1075         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1076         if (!rdtgrp) {
1077                 rdtgroup_kn_unlock(of->kn);
1078                 return -ENOENT;
1079         }
1080
1081         rdt_last_cmd_clear();
1082
1083         mode = rdtgrp->mode;
1084
1085         if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1086             (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1087             (!strcmp(buf, "pseudo-locksetup") &&
1088              mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1089             (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1090                 goto out;
1091
1092         if (mode == RDT_MODE_PSEUDO_LOCKED) {
1093                 rdt_last_cmd_printf("cannot change pseudo-locked group\n");
1094                 ret = -EINVAL;
1095                 goto out;
1096         }
1097
1098         if (!strcmp(buf, "shareable")) {
1099                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1100                         ret = rdtgroup_locksetup_exit(rdtgrp);
1101                         if (ret)
1102                                 goto out;
1103                 }
1104                 rdtgrp->mode = RDT_MODE_SHAREABLE;
1105         } else if (!strcmp(buf, "exclusive")) {
1106                 if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1107                         ret = -EINVAL;
1108                         goto out;
1109                 }
1110                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1111                         ret = rdtgroup_locksetup_exit(rdtgrp);
1112                         if (ret)
1113                                 goto out;
1114                 }
1115                 rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1116         } else if (!strcmp(buf, "pseudo-locksetup")) {
1117                 ret = rdtgroup_locksetup_enter(rdtgrp);
1118                 if (ret)
1119                         goto out;
1120                 rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1121         } else {
1122                 rdt_last_cmd_printf("unknown/unsupported mode\n");
1123                 ret = -EINVAL;
1124         }
1125
1126 out:
1127         rdtgroup_kn_unlock(of->kn);
1128         return ret ?: nbytes;
1129 }
1130
1131 /**
1132  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1133  * @r: RDT resource to which @d belongs.
1134  * @d: RDT domain instance.
1135  * @cbm: bitmask for which the size should be computed.
1136  *
1137  * The bitmask provided associated with the RDT domain instance @d will be
1138  * translated into how many bytes it represents. The size in bytes is
1139  * computed by first dividing the total cache size by the CBM length to
1140  * determine how many bytes each bit in the bitmask represents. The result
1141  * is multiplied with the number of bits set in the bitmask.
1142  *
1143  * @cbm is unsigned long, even if only 32 bits are used to make the
1144  * bitmap functions work correctly.
1145  */
1146 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1147                                   struct rdt_domain *d, unsigned long cbm)
1148 {
1149         struct cpu_cacheinfo *ci;
1150         unsigned int size = 0;
1151         int num_b, i;
1152
1153         num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1154         ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1155         for (i = 0; i < ci->num_leaves; i++) {
1156                 if (ci->info_list[i].level == r->cache_level) {
1157                         size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1158                         break;
1159                 }
1160         }
1161
1162         return size;
1163 }
1164
1165 /**
1166  * rdtgroup_size_show - Display size in bytes of allocated regions
1167  *
1168  * The "size" file mirrors the layout of the "schemata" file, printing the
1169  * size in bytes of each region instead of the capacity bitmask.
1170  *
1171  */
1172 static int rdtgroup_size_show(struct kernfs_open_file *of,
1173                               struct seq_file *s, void *v)
1174 {
1175         struct rdtgroup *rdtgrp;
1176         struct rdt_resource *r;
1177         struct rdt_domain *d;
1178         unsigned int size;
1179         bool sep;
1180         u32 ctrl;
1181
1182         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1183         if (!rdtgrp) {
1184                 rdtgroup_kn_unlock(of->kn);
1185                 return -ENOENT;
1186         }
1187
1188         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1189                 seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name);
1190                 size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
1191                                             rdtgrp->plr->d,
1192                                             rdtgrp->plr->cbm);
1193                 seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
1194                 goto out;
1195         }
1196
1197         for_each_alloc_enabled_rdt_resource(r) {
1198                 sep = false;
1199                 seq_printf(s, "%*s:", max_name_width, r->name);
1200                 list_for_each_entry(d, &r->domains, list) {
1201                         if (sep)
1202                                 seq_putc(s, ';');
1203                         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1204                                 size = 0;
1205                         } else {
1206                                 ctrl = (!is_mba_sc(r) ?
1207                                                 d->ctrl_val[rdtgrp->closid] :
1208                                                 d->mbps_val[rdtgrp->closid]);
1209                                 if (r->rid == RDT_RESOURCE_MBA)
1210                                         size = ctrl;
1211                                 else
1212                                         size = rdtgroup_cbm_to_size(r, d, ctrl);
1213                         }
1214                         seq_printf(s, "%d=%u", d->id, size);
1215                         sep = true;
1216                 }
1217                 seq_putc(s, '\n');
1218         }
1219
1220 out:
1221         rdtgroup_kn_unlock(of->kn);
1222
1223         return 0;
1224 }
1225
/*
 * Table of all resctrl files known to this file: info files for cache and
 * memory bandwidth resources, monitoring info files, and the per-group
 * files. rdtgroup_add_files() adds an entry to a directory only when every
 * bit of the entry's .fflags is set in the flags requested for that
 * directory. rdtgroup_kn_mode_restore() looks up .mode here by .name.
 */
static struct rftype res_common_files[] = {
	{
		.name		= "last_cmd_status",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_last_cmd_status_show,
		.fflags		= RF_TOP_INFO,
	},
	{
		.name		= "num_closids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_closids_show,
		.fflags		= RF_CTRL_INFO,
	},
	{
		.name		= "mon_features",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_mon_features_show,
		.fflags		= RF_MON_INFO,
	},
	{
		.name		= "num_rmids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_rmids_show,
		.fflags		= RF_MON_INFO,
	},
	/* Control info files restricted to cache resources. */
	{
		.name		= "cbm_mask",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_default_ctrl_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "min_cbm_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_cbm_bits_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "shareable_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_shareable_bits_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "bit_usage",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bit_usage_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	/* Control info files restricted to memory bandwidth resources. */
	{
		.name		= "min_bandwidth",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_bw_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "bandwidth_gran",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bw_gran_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "delay_linear",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_delay_linear_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	/* The only writable info file in this table. */
	{
		.name		= "max_threshold_occupancy",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= max_threshold_occ_write,
		.seq_show	= max_threshold_occ_show,
		.fflags		= RF_MON_INFO | RFTYPE_RES_CACHE,
	},
	/* "cpus" and "cpus_list" share handlers; only .flags differs. */
	{
		.name		= "cpus",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "cpus_list",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.flags		= RFTYPE_FLAGS_CPUS_LIST,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "tasks",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_tasks_write,
		.seq_show	= rdtgroup_tasks_show,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "schemata",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_schemata_write,
		.seq_show	= rdtgroup_schemata_show,
		.fflags		= RF_CTRL_BASE,
	},
	{
		.name		= "mode",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_mode_write,
		.seq_show	= rdtgroup_mode_show,
		.fflags		= RF_CTRL_BASE,
	},
	{
		.name		= "size",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdtgroup_size_show,
		.fflags		= RF_CTRL_BASE,
	},

};
1363
1364 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1365 {
1366         struct rftype *rfts, *rft;
1367         int ret, len;
1368
1369         rfts = res_common_files;
1370         len = ARRAY_SIZE(res_common_files);
1371
1372         lockdep_assert_held(&rdtgroup_mutex);
1373
1374         for (rft = rfts; rft < rfts + len; rft++) {
1375                 if ((fflags & rft->fflags) == rft->fflags) {
1376                         ret = rdtgroup_add_file(kn, rft);
1377                         if (ret)
1378                                 goto error;
1379                 }
1380         }
1381
1382         return 0;
1383 error:
1384         pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
1385         while (--rft >= rfts) {
1386                 if ((fflags & rft->fflags) == rft->fflags)
1387                         kernfs_remove_by_name(kn, rft->name);
1388         }
1389         return ret;
1390 }
1391
1392 /**
1393  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
1394  * @r: The resource group with which the file is associated.
1395  * @name: Name of the file
1396  *
1397  * The permissions of named resctrl file, directory, or link are modified
1398  * to not allow read, write, or execute by any user.
1399  *
1400  * WARNING: This function is intended to communicate to the user that the
1401  * resctrl file has been locked down - that it is not relevant to the
1402  * particular state the system finds itself in. It should not be relied
1403  * on to protect from user access because after the file's permissions
1404  * are restricted the user can still change the permissions using chmod
1405  * from the command line.
1406  *
1407  * Return: 0 on success, <0 on failure.
1408  */
1409 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
1410 {
1411         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1412         struct kernfs_node *kn;
1413         int ret = 0;
1414
1415         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1416         if (!kn)
1417                 return -ENOENT;
1418
1419         switch (kernfs_type(kn)) {
1420         case KERNFS_DIR:
1421                 iattr.ia_mode = S_IFDIR;
1422                 break;
1423         case KERNFS_FILE:
1424                 iattr.ia_mode = S_IFREG;
1425                 break;
1426         case KERNFS_LINK:
1427                 iattr.ia_mode = S_IFLNK;
1428                 break;
1429         }
1430
1431         ret = kernfs_setattr(kn, &iattr);
1432         kernfs_put(kn);
1433         return ret;
1434 }
1435
1436 /**
1437  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
1438  * @r: The resource group with which the file is associated.
1439  * @name: Name of the file
1440  * @mask: Mask of permissions that should be restored
1441  *
1442  * Restore the permissions of the named file. If @name is a directory the
1443  * permissions of its parent will be used.
1444  *
1445  * Return: 0 on success, <0 on failure.
1446  */
1447 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
1448                              umode_t mask)
1449 {
1450         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1451         struct kernfs_node *kn, *parent;
1452         struct rftype *rfts, *rft;
1453         int ret, len;
1454
1455         rfts = res_common_files;
1456         len = ARRAY_SIZE(res_common_files);
1457
1458         for (rft = rfts; rft < rfts + len; rft++) {
1459                 if (!strcmp(rft->name, name))
1460                         iattr.ia_mode = rft->mode & mask;
1461         }
1462
1463         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1464         if (!kn)
1465                 return -ENOENT;
1466
1467         switch (kernfs_type(kn)) {
1468         case KERNFS_DIR:
1469                 parent = kernfs_get_parent(kn);
1470                 if (parent) {
1471                         iattr.ia_mode |= parent->mode;
1472                         kernfs_put(parent);
1473                 }
1474                 iattr.ia_mode |= S_IFDIR;
1475                 break;
1476         case KERNFS_FILE:
1477                 iattr.ia_mode |= S_IFREG;
1478                 break;
1479         case KERNFS_LINK:
1480                 iattr.ia_mode |= S_IFLNK;
1481                 break;
1482         }
1483
1484         ret = kernfs_setattr(kn, &iattr);
1485         kernfs_put(kn);
1486         return ret;
1487 }
1488
1489 static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
1490                                       unsigned long fflags)
1491 {
1492         struct kernfs_node *kn_subdir;
1493         int ret;
1494
1495         kn_subdir = kernfs_create_dir(kn_info, name,
1496                                       kn_info->mode, r);
1497         if (IS_ERR(kn_subdir))
1498                 return PTR_ERR(kn_subdir);
1499
1500         kernfs_get(kn_subdir);
1501         ret = rdtgroup_kn_set_ugid(kn_subdir);
1502         if (ret)
1503                 return ret;
1504
1505         ret = rdtgroup_add_files(kn_subdir, fflags);
1506         if (!ret)
1507                 kernfs_activate(kn_subdir);
1508
1509         return ret;
1510 }
1511
1512 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
1513 {
1514         struct rdt_resource *r;
1515         unsigned long fflags;
1516         char name[32];
1517         int ret;
1518
1519         /* create the directory */
1520         kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
1521         if (IS_ERR(kn_info))
1522                 return PTR_ERR(kn_info);
1523         kernfs_get(kn_info);
1524
1525         ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
1526         if (ret)
1527                 goto out_destroy;
1528
1529         for_each_alloc_enabled_rdt_resource(r) {
1530                 fflags =  r->fflags | RF_CTRL_INFO;
1531                 ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
1532                 if (ret)
1533                         goto out_destroy;
1534         }
1535
1536         for_each_mon_enabled_rdt_resource(r) {
1537                 fflags =  r->fflags | RF_MON_INFO;
1538                 sprintf(name, "%s_MON", r->name);
1539                 ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
1540                 if (ret)
1541                         goto out_destroy;
1542         }
1543
1544         /*
1545          * This extra ref will be put in kernfs_remove() and guarantees
1546          * that @rdtgrp->kn is always accessible.
1547          */
1548         kernfs_get(kn_info);
1549
1550         ret = rdtgroup_kn_set_ugid(kn_info);
1551         if (ret)
1552                 goto out_destroy;
1553
1554         kernfs_activate(kn_info);
1555
1556         return 0;
1557
1558 out_destroy:
1559         kernfs_remove(kn_info);
1560         return ret;
1561 }
1562
1563 static int
1564 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
1565                     char *name, struct kernfs_node **dest_kn)
1566 {
1567         struct kernfs_node *kn;
1568         int ret;
1569
1570         /* create the directory */
1571         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1572         if (IS_ERR(kn))
1573                 return PTR_ERR(kn);
1574
1575         if (dest_kn)
1576                 *dest_kn = kn;
1577
1578         /*
1579          * This extra ref will be put in kernfs_remove() and guarantees
1580          * that @rdtgrp->kn is always accessible.
1581          */
1582         kernfs_get(kn);
1583
1584         ret = rdtgroup_kn_set_ugid(kn);
1585         if (ret)
1586                 goto out_destroy;
1587
1588         kernfs_activate(kn);
1589
1590         return 0;
1591
1592 out_destroy:
1593         kernfs_remove(kn);
1594         return ret;
1595 }
1596
1597 static void l3_qos_cfg_update(void *arg)
1598 {
1599         bool *enable = arg;
1600
1601         wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
1602 }
1603
1604 static void l2_qos_cfg_update(void *arg)
1605 {
1606         bool *enable = arg;
1607
1608         wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1609 }
1610
1611 static inline bool is_mba_linear(void)
1612 {
1613         return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1614 }
1615
1616 static int set_cache_qos_cfg(int level, bool enable)
1617 {
1618         void (*update)(void *arg);
1619         struct rdt_resource *r_l;
1620         cpumask_var_t cpu_mask;
1621         struct rdt_domain *d;
1622         int cpu;
1623
1624         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1625                 return -ENOMEM;
1626
1627         if (level == RDT_RESOURCE_L3)
1628                 update = l3_qos_cfg_update;
1629         else if (level == RDT_RESOURCE_L2)
1630                 update = l2_qos_cfg_update;
1631         else
1632                 return -EINVAL;
1633
1634         r_l = &rdt_resources_all[level];
1635         list_for_each_entry(d, &r_l->domains, list) {
1636                 /* Pick one CPU from each domain instance to update MSR */
1637                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1638         }
1639         cpu = get_cpu();
1640         /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
1641         if (cpumask_test_cpu(cpu, cpu_mask))
1642                 update(&enable);
1643         /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
1644         smp_call_function_many(cpu_mask, update, &enable, 1);
1645         put_cpu();
1646
1647         free_cpumask_var(cpu_mask);
1648
1649         return 0;
1650 }
1651
1652 /*
1653  * Enable or disable the MBA software controller
1654  * which helps user specify bandwidth in MBps.
1655  * MBA software controller is supported only if
1656  * MBM is supported and MBA is in linear scale.
1657  */
1658 static int set_mba_sc(bool mba_sc)
1659 {
1660         struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1661         struct rdt_domain *d;
1662
1663         if (!is_mbm_enabled() || !is_mba_linear() ||
1664             mba_sc == is_mba_sc(r))
1665                 return -EINVAL;
1666
1667         r->membw.mba_sc = mba_sc;
1668         list_for_each_entry(d, &r->domains, list)
1669                 setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1670
1671         return 0;
1672 }
1673
1674 static int cdp_enable(int level, int data_type, int code_type)
1675 {
1676         struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
1677         struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
1678         struct rdt_resource *r_l = &rdt_resources_all[level];
1679         int ret;
1680
1681         if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
1682             !r_lcode->alloc_capable)
1683                 return -EINVAL;
1684
1685         ret = set_cache_qos_cfg(level, true);
1686         if (!ret) {
1687                 r_l->alloc_enabled = false;
1688                 r_ldata->alloc_enabled = true;
1689                 r_lcode->alloc_enabled = true;
1690         }
1691         return ret;
1692 }
1693
1694 static int cdpl3_enable(void)
1695 {
1696         return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
1697                           RDT_RESOURCE_L3CODE);
1698 }
1699
1700 static int cdpl2_enable(void)
1701 {
1702         return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
1703                           RDT_RESOURCE_L2CODE);
1704 }
1705
1706 static void cdp_disable(int level, int data_type, int code_type)
1707 {
1708         struct rdt_resource *r = &rdt_resources_all[level];
1709
1710         r->alloc_enabled = r->alloc_capable;
1711
1712         if (rdt_resources_all[data_type].alloc_enabled) {
1713                 rdt_resources_all[data_type].alloc_enabled = false;
1714                 rdt_resources_all[code_type].alloc_enabled = false;
1715                 set_cache_qos_cfg(level, false);
1716         }
1717 }
1718
1719 static void cdpl3_disable(void)
1720 {
1721         cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
1722 }
1723
1724 static void cdpl2_disable(void)
1725 {
1726         cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
1727 }
1728
1729 static void cdp_disable_all(void)
1730 {
1731         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
1732                 cdpl3_disable();
1733         if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
1734                 cdpl2_disable();
1735 }
1736
1737 static int parse_rdtgroupfs_options(char *data)
1738 {
1739         char *token, *o = data;
1740         int ret = 0;
1741
1742         while ((token = strsep(&o, ",")) != NULL) {
1743                 if (!*token) {
1744                         ret = -EINVAL;
1745                         goto out;
1746                 }
1747
1748                 if (!strcmp(token, "cdp")) {
1749                         ret = cdpl3_enable();
1750                         if (ret)
1751                                 goto out;
1752                 } else if (!strcmp(token, "cdpl2")) {
1753                         ret = cdpl2_enable();
1754                         if (ret)
1755                                 goto out;
1756                 } else if (!strcmp(token, "mba_MBps")) {
1757                         ret = set_mba_sc(true);
1758                         if (ret)
1759                                 goto out;
1760                 } else {
1761                         ret = -EINVAL;
1762                         goto out;
1763                 }
1764         }
1765
1766         return 0;
1767
1768 out:
1769         pr_err("Invalid mount option \"%s\"\n", token);
1770
1771         return ret;
1772 }
1773
1774 /*
1775  * We don't allow rdtgroup directories to be created anywhere
1776  * except the root directory. Thus when looking for the rdtgroup
1777  * structure for a kernfs node we are either looking at a directory,
1778  * in which case the rdtgroup structure is pointed at by the "priv"
1779  * field, otherwise we have a file, and need only look to the parent
1780  * to find the rdtgroup.
1781  */
1782 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
1783 {
1784         if (kernfs_type(kn) == KERNFS_DIR) {
1785                 /*
1786                  * All the resource directories use "kn->priv"
1787                  * to point to the "struct rdtgroup" for the
1788                  * resource. "info" and its subdirectories don't
1789                  * have rdtgroup structures, so return NULL here.
1790                  */
1791                 if (kn == kn_info || kn->parent == kn_info)
1792                         return NULL;
1793                 else
1794                         return kn->priv;
1795         } else {
1796                 return kn->parent->priv;
1797         }
1798 }
1799
1800 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
1801 {
1802         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1803
1804         if (!rdtgrp)
1805                 return NULL;
1806
1807         atomic_inc(&rdtgrp->waitcount);
1808         kernfs_break_active_protection(kn);
1809
1810         mutex_lock(&rdtgroup_mutex);
1811
1812         /* Was this group deleted while we waited? */
1813         if (rdtgrp->flags & RDT_DELETED)
1814                 return NULL;
1815
1816         return rdtgrp;
1817 }
1818
1819 void rdtgroup_kn_unlock(struct kernfs_node *kn)
1820 {
1821         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1822
1823         if (!rdtgrp)
1824                 return;
1825
1826         mutex_unlock(&rdtgroup_mutex);
1827
1828         if (atomic_dec_and_test(&rdtgrp->waitcount) &&
1829             (rdtgrp->flags & RDT_DELETED)) {
1830                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
1831                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
1832                         rdtgroup_pseudo_lock_remove(rdtgrp);
1833                 kernfs_unbreak_active_protection(kn);
1834                 kernfs_put(rdtgrp->kn);
1835                 kfree(rdtgrp);
1836         } else {
1837                 kernfs_unbreak_active_protection(kn);
1838         }
1839 }
1840
/* Defined further down; needed by rdt_mount(). */
static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			     struct rdtgroup *prgrp,
			     struct kernfs_node **dest_kn);
1844
1845 static struct dentry *rdt_mount(struct file_system_type *fs_type,
1846                                 int flags, const char *unused_dev_name,
1847                                 void *data)
1848 {
1849         struct rdt_domain *dom;
1850         struct rdt_resource *r;
1851         struct dentry *dentry;
1852         int ret;
1853
1854         cpus_read_lock();
1855         mutex_lock(&rdtgroup_mutex);
1856         /*
1857          * resctrl file system can only be mounted once.
1858          */
1859         if (static_branch_unlikely(&rdt_enable_key)) {
1860                 dentry = ERR_PTR(-EBUSY);
1861                 goto out;
1862         }
1863
1864         ret = parse_rdtgroupfs_options(data);
1865         if (ret) {
1866                 dentry = ERR_PTR(ret);
1867                 goto out_cdp;
1868         }
1869
1870         closid_init();
1871
1872         ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
1873         if (ret) {
1874                 dentry = ERR_PTR(ret);
1875                 goto out_cdp;
1876         }
1877
1878         if (rdt_mon_capable) {
1879                 ret = mongroup_create_dir(rdtgroup_default.kn,
1880                                           NULL, "mon_groups",
1881                                           &kn_mongrp);
1882                 if (ret) {
1883                         dentry = ERR_PTR(ret);
1884                         goto out_info;
1885                 }
1886                 kernfs_get(kn_mongrp);
1887
1888                 ret = mkdir_mondata_all(rdtgroup_default.kn,
1889                                         &rdtgroup_default, &kn_mondata);
1890                 if (ret) {
1891                         dentry = ERR_PTR(ret);
1892                         goto out_mongrp;
1893                 }
1894                 kernfs_get(kn_mondata);
1895                 rdtgroup_default.mon.mon_data_kn = kn_mondata;
1896         }
1897
1898         ret = rdt_pseudo_lock_init();
1899         if (ret) {
1900                 dentry = ERR_PTR(ret);
1901                 goto out_mondata;
1902         }
1903
1904         dentry = kernfs_mount(fs_type, flags, rdt_root,
1905                               RDTGROUP_SUPER_MAGIC, NULL);
1906         if (IS_ERR(dentry))
1907                 goto out_psl;
1908
1909         if (rdt_alloc_capable)
1910                 static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
1911         if (rdt_mon_capable)
1912                 static_branch_enable_cpuslocked(&rdt_mon_enable_key);
1913
1914         if (rdt_alloc_capable || rdt_mon_capable)
1915                 static_branch_enable_cpuslocked(&rdt_enable_key);
1916
1917         if (is_mbm_enabled()) {
1918                 r = &rdt_resources_all[RDT_RESOURCE_L3];
1919                 list_for_each_entry(dom, &r->domains, list)
1920                         mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
1921         }
1922
1923         goto out;
1924
1925 out_psl:
1926         rdt_pseudo_lock_release();
1927 out_mondata:
1928         if (rdt_mon_capable)
1929                 kernfs_remove(kn_mondata);
1930 out_mongrp:
1931         if (rdt_mon_capable)
1932                 kernfs_remove(kn_mongrp);
1933 out_info:
1934         kernfs_remove(kn_info);
1935 out_cdp:
1936         cdp_disable_all();
1937 out:
1938         rdt_last_cmd_clear();
1939         mutex_unlock(&rdtgroup_mutex);
1940         cpus_read_unlock();
1941
1942         return dentry;
1943 }
1944
1945 static int reset_all_ctrls(struct rdt_resource *r)
1946 {
1947         struct msr_param msr_param;
1948         cpumask_var_t cpu_mask;
1949         struct rdt_domain *d;
1950         int i, cpu;
1951
1952         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1953                 return -ENOMEM;
1954
1955         msr_param.res = r;
1956         msr_param.low = 0;
1957         msr_param.high = r->num_closid;
1958
1959         /*
1960          * Disable resource control for this resource by setting all
1961          * CBMs in all domains to the maximum mask value. Pick one CPU
1962          * from each domain to update the MSRs below.
1963          */
1964         list_for_each_entry(d, &r->domains, list) {
1965                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1966
1967                 for (i = 0; i < r->num_closid; i++)
1968                         d->ctrl_val[i] = r->default_ctrl;
1969         }
1970         cpu = get_cpu();
1971         /* Update CBM on this cpu if it's in cpu_mask. */
1972         if (cpumask_test_cpu(cpu, cpu_mask))
1973                 rdt_ctrl_update(&msr_param);
1974         /* Update CBM on all other cpus in cpu_mask. */
1975         smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
1976         put_cpu();
1977
1978         free_cpumask_var(cpu_mask);
1979
1980         return 0;
1981 }
1982
1983 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
1984 {
1985         return (rdt_alloc_capable &&
1986                 (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
1987 }
1988
1989 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
1990 {
1991         return (rdt_mon_capable &&
1992                 (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
1993 }
1994
1995 /*
1996  * Move tasks from one to the other group. If @from is NULL, then all tasks
1997  * in the systems are moved unconditionally (used for teardown).
1998  *
1999  * If @mask is not NULL the cpus on which moved tasks are running are set
2000  * in that mask so the update smp function call is restricted to affected
2001  * cpus.
2002  */
2003 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2004                                  struct cpumask *mask)
2005 {
2006         struct task_struct *p, *t;
2007
2008         read_lock(&tasklist_lock);
2009         for_each_process_thread(p, t) {
2010                 if (!from || is_closid_match(t, from) ||
2011                     is_rmid_match(t, from)) {
2012                         t->closid = to->closid;
2013                         t->rmid = to->mon.rmid;
2014
2015 #ifdef CONFIG_SMP
2016                         /*
2017                          * This is safe on x86 w/o barriers as the ordering
2018                          * of writing to task_cpu() and t->on_cpu is
2019                          * reverse to the reading here. The detection is
2020                          * inaccurate as tasks might move or schedule
2021                          * before the smp function call takes place. In
2022                          * such a case the function call is pointless, but
2023                          * there is no other side effect.
2024                          */
2025                         if (mask && t->on_cpu)
2026                                 cpumask_set_cpu(task_cpu(t), mask);
2027 #endif
2028                 }
2029         }
2030         read_unlock(&tasklist_lock);
2031 }
2032
2033 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2034 {
2035         struct rdtgroup *sentry, *stmp;
2036         struct list_head *head;
2037
2038         head = &rdtgrp->mon.crdtgrp_list;
2039         list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2040                 free_rmid(sentry->mon.rmid);
2041                 list_del(&sentry->mon.crdtgrp_list);
2042                 kfree(sentry);
2043         }
2044 }
2045
2046 /*
2047  * Forcibly remove all of subdirectories under root.
2048  */
2049 static void rmdir_all_sub(void)
2050 {
2051         struct rdtgroup *rdtgrp, *tmp;
2052
2053         /* Move all tasks to the default resource group */
2054         rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2055
2056         list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2057                 /* Free any child rmids */
2058                 free_all_child_rdtgrp(rdtgrp);
2059
2060                 /* Remove each rdtgroup other than root */
2061                 if (rdtgrp == &rdtgroup_default)
2062                         continue;
2063
2064                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2065                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2066                         rdtgroup_pseudo_lock_remove(rdtgrp);
2067
2068                 /*
2069                  * Give any CPUs back to the default group. We cannot copy
2070                  * cpu_online_mask because a CPU might have executed the
2071                  * offline callback already, but is still marked online.
2072                  */
2073                 cpumask_or(&rdtgroup_default.cpu_mask,
2074                            &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2075
2076                 free_rmid(rdtgrp->mon.rmid);
2077
2078                 kernfs_remove(rdtgrp->kn);
2079                 list_del(&rdtgrp->rdtgroup_list);
2080                 kfree(rdtgrp);
2081         }
2082         /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2083         update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2084
2085         kernfs_remove(kn_info);
2086         kernfs_remove(kn_mongrp);
2087         kernfs_remove(kn_mondata);
2088 }
2089
2090 static void rdt_kill_sb(struct super_block *sb)
2091 {
2092         struct rdt_resource *r;
2093
2094         cpus_read_lock();
2095         mutex_lock(&rdtgroup_mutex);
2096
2097         set_mba_sc(false);
2098
2099         /*Put everything back to default values. */
2100         for_each_alloc_enabled_rdt_resource(r)
2101                 reset_all_ctrls(r);
2102         cdp_disable_all();
2103         rmdir_all_sub();
2104         rdt_pseudo_lock_release();
2105         rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2106         static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
2107         static_branch_disable_cpuslocked(&rdt_mon_enable_key);
2108         static_branch_disable_cpuslocked(&rdt_enable_key);
2109         kernfs_kill_sb(sb);
2110         mutex_unlock(&rdtgroup_mutex);
2111         cpus_read_unlock();
2112 }
2113
2114 static struct file_system_type rdt_fs_type = {
2115         .name    = "resctrl",
2116         .mount   = rdt_mount,
2117         .kill_sb = rdt_kill_sb,
2118 };
2119
2120 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2121                        void *priv)
2122 {
2123         struct kernfs_node *kn;
2124         int ret = 0;
2125
2126         kn = __kernfs_create_file(parent_kn, name, 0444,
2127                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2128                                   &kf_mondata_ops, priv, NULL, NULL);
2129         if (IS_ERR(kn))
2130                 return PTR_ERR(kn);
2131
2132         ret = rdtgroup_kn_set_ugid(kn);
2133         if (ret) {
2134                 kernfs_remove(kn);
2135                 return ret;
2136         }
2137
2138         return ret;
2139 }
2140
2141 /*
2142  * Remove all subdirectories of mon_data of ctrl_mon groups
2143  * and monitor groups with given domain id.
2144  */
2145 void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
2146 {
2147         struct rdtgroup *prgrp, *crgrp;
2148         char name[32];
2149
2150         if (!r->mon_enabled)
2151                 return;
2152
2153         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2154                 sprintf(name, "mon_%s_%02d", r->name, dom_id);
2155                 kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
2156
2157                 list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
2158                         kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
2159         }
2160 }
2161
2162 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
2163                                 struct rdt_domain *d,
2164                                 struct rdt_resource *r, struct rdtgroup *prgrp)
2165 {
2166         union mon_data_bits priv;
2167         struct kernfs_node *kn;
2168         struct mon_evt *mevt;
2169         struct rmid_read rr;
2170         char name[32];
2171         int ret;
2172
2173         sprintf(name, "mon_%s_%02d", r->name, d->id);
2174         /* create the directory */
2175         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2176         if (IS_ERR(kn))
2177                 return PTR_ERR(kn);
2178
2179         /*
2180          * This extra ref will be put in kernfs_remove() and guarantees
2181          * that kn is always accessible.
2182          */
2183         kernfs_get(kn);
2184         ret = rdtgroup_kn_set_ugid(kn);
2185         if (ret)
2186                 goto out_destroy;
2187
2188         if (WARN_ON(list_empty(&r->evt_list))) {
2189                 ret = -EPERM;
2190                 goto out_destroy;
2191         }
2192
2193         priv.u.rid = r->rid;
2194         priv.u.domid = d->id;
2195         list_for_each_entry(mevt, &r->evt_list, list) {
2196                 priv.u.evtid = mevt->evtid;
2197                 ret = mon_addfile(kn, mevt->name, priv.priv);
2198                 if (ret)
2199                         goto out_destroy;
2200
2201                 if (is_mbm_event(mevt->evtid))
2202                         mon_event_read(&rr, d, prgrp, mevt->evtid, true);
2203         }
2204         kernfs_activate(kn);
2205         return 0;
2206
2207 out_destroy:
2208         kernfs_remove(kn);
2209         return ret;
2210 }
2211
2212 /*
2213  * Add all subdirectories of mon_data for "ctrl_mon" groups
2214  * and "monitor" groups with given domain id.
2215  */
2216 void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
2217                                     struct rdt_domain *d)
2218 {
2219         struct kernfs_node *parent_kn;
2220         struct rdtgroup *prgrp, *crgrp;
2221         struct list_head *head;
2222
2223         if (!r->mon_enabled)
2224                 return;
2225
2226         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2227                 parent_kn = prgrp->mon.mon_data_kn;
2228                 mkdir_mondata_subdir(parent_kn, d, r, prgrp);
2229
2230                 head = &prgrp->mon.crdtgrp_list;
2231                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
2232                         parent_kn = crgrp->mon.mon_data_kn;
2233                         mkdir_mondata_subdir(parent_kn, d, r, crgrp);
2234                 }
2235         }
2236 }
2237
2238 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
2239                                        struct rdt_resource *r,
2240                                        struct rdtgroup *prgrp)
2241 {
2242         struct rdt_domain *dom;
2243         int ret;
2244
2245         list_for_each_entry(dom, &r->domains, list) {
2246                 ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
2247                 if (ret)
2248                         return ret;
2249         }
2250
2251         return 0;
2252 }
2253
2254 /*
2255  * This creates a directory mon_data which contains the monitored data.
2256  *
2257  * mon_data has one directory for each domain whic are named
2258  * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
2259  * with L3 domain looks as below:
2260  * ./mon_data:
2261  * mon_L3_00
2262  * mon_L3_01
2263  * mon_L3_02
2264  * ...
2265  *
2266  * Each domain directory has one file per event:
2267  * ./mon_L3_00/:
2268  * llc_occupancy
2269  *
2270  */
2271 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2272                              struct rdtgroup *prgrp,
2273                              struct kernfs_node **dest_kn)
2274 {
2275         struct rdt_resource *r;
2276         struct kernfs_node *kn;
2277         int ret;
2278
2279         /*
2280          * Create the mon_data directory first.
2281          */
2282         ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
2283         if (ret)
2284                 return ret;
2285
2286         if (dest_kn)
2287                 *dest_kn = kn;
2288
2289         /*
2290          * Create the subdirectories for each domain. Note that all events
2291          * in a domain like L3 are grouped into a resource whose domain is L3
2292          */
2293         for_each_mon_enabled_rdt_resource(r) {
2294                 ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
2295                 if (ret)
2296                         goto out_destroy;
2297         }
2298
2299         return 0;
2300
2301 out_destroy:
2302         kernfs_remove(kn);
2303         return ret;
2304 }
2305
2306 /**
2307  * cbm_ensure_valid - Enforce validity on provided CBM
2308  * @_val:       Candidate CBM
2309  * @r:          RDT resource to which the CBM belongs
2310  *
2311  * The provided CBM represents all cache portions available for use. This
2312  * may be represented by a bitmap that does not consist of contiguous ones
2313  * and thus be an invalid CBM.
2314  * Here the provided CBM is forced to be a valid CBM by only considering
2315  * the first set of contiguous bits as valid and clearing all bits.
2316  * The intention here is to provide a valid default CBM with which a new
2317  * resource group is initialized. The user can follow this with a
2318  * modification to the CBM if the default does not satisfy the
2319  * requirements.
2320  */
2321 static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
2322 {
2323         /*
2324          * Convert the u32 _val to an unsigned long required by all the bit
2325          * operations within this function. No more than 32 bits of this
2326          * converted value can be accessed because all bit operations are
2327          * additionally provided with cbm_len that is initialized during
2328          * hardware enumeration using five bits from the EAX register and
2329          * thus never can exceed 32 bits.
2330          */
2331         unsigned long *val = (unsigned long *)_val;
2332         unsigned int cbm_len = r->cache.cbm_len;
2333         unsigned long first_bit, zero_bit;
2334
2335         if (*val == 0)
2336                 return;
2337
2338         first_bit = find_first_bit(val, cbm_len);
2339         zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
2340
2341         /* Clear any remaining bits to ensure contiguous region */
2342         bitmap_clear(val, zero_bit, cbm_len - zero_bit);
2343 }
2344
2345 /**
2346  * rdtgroup_init_alloc - Initialize the new RDT group's allocations
2347  *
2348  * A new RDT group is being created on an allocation capable (CAT)
2349  * supporting system. Set this group up to start off with all usable
2350  * allocations. That is, all shareable and unused bits.
2351  *
2352  * All-zero CBM is invalid. If there are no more shareable bits available
2353  * on any domain then the entire allocation will fail.
2354  */
2355 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
2356 {
2357         u32 used_b = 0, unused_b = 0;
2358         u32 closid = rdtgrp->closid;
2359         struct rdt_resource *r;
2360         unsigned long tmp_cbm;
2361         enum rdtgrp_mode mode;
2362         struct rdt_domain *d;
2363         int i, ret;
2364         u32 *ctrl;
2365
2366         for_each_alloc_enabled_rdt_resource(r) {
2367                 /*
2368                  * Only initialize default allocations for CBM cache
2369                  * resources
2370                  */
2371                 if (r->rid == RDT_RESOURCE_MBA)
2372                         continue;
2373                 list_for_each_entry(d, &r->domains, list) {
2374                         d->have_new_ctrl = false;
2375                         d->new_ctrl = r->cache.shareable_bits;
2376                         used_b = r->cache.shareable_bits;
2377                         ctrl = d->ctrl_val;
2378                         for (i = 0; i < closids_supported(); i++, ctrl++) {
2379                                 if (closid_allocated(i) && i != closid) {
2380                                         mode = rdtgroup_mode_by_closid(i);
2381                                         if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
2382                                                 break;
2383                                         used_b |= *ctrl;
2384                                         if (mode == RDT_MODE_SHAREABLE)
2385                                                 d->new_ctrl |= *ctrl;
2386                                 }
2387                         }
2388                         if (d->plr && d->plr->cbm > 0)
2389                                 used_b |= d->plr->cbm;
2390                         unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
2391                         unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
2392                         d->new_ctrl |= unused_b;
2393                         /*
2394                          * Force the initial CBM to be valid, user can
2395                          * modify the CBM based on system availability.
2396                          */
2397                         cbm_ensure_valid(&d->new_ctrl, r);
2398                         /*
2399                          * Assign the u32 CBM to an unsigned long to ensure
2400                          * that bitmap_weight() does not access out-of-bound
2401                          * memory.
2402                          */
2403                         tmp_cbm = d->new_ctrl;
2404                         if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
2405                             r->cache.min_cbm_bits) {
2406                                 rdt_last_cmd_printf("no space on %s:%d\n",
2407                                                     r->name, d->id);
2408                                 return -ENOSPC;
2409                         }
2410                         d->have_new_ctrl = true;
2411                 }
2412         }
2413
2414         for_each_alloc_enabled_rdt_resource(r) {
2415                 /*
2416                  * Only initialize default allocations for CBM cache
2417                  * resources
2418                  */
2419                 if (r->rid == RDT_RESOURCE_MBA)
2420                         continue;
2421                 ret = update_domains(r, rdtgrp->closid);
2422                 if (ret < 0) {
2423                         rdt_last_cmd_puts("failed to initialize allocations\n");
2424                         return ret;
2425                 }
2426                 rdtgrp->mode = RDT_MODE_SHAREABLE;
2427         }
2428
2429         return 0;
2430 }
2431
2432 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
2433                              struct kernfs_node *prgrp_kn,
2434                              const char *name, umode_t mode,
2435                              enum rdt_group_type rtype, struct rdtgroup **r)
2436 {
2437         struct rdtgroup *prdtgrp, *rdtgrp;
2438         struct kernfs_node *kn;
2439         uint files = 0;
2440         int ret;
2441
2442         prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
2443         rdt_last_cmd_clear();
2444         if (!prdtgrp) {
2445                 ret = -ENODEV;
2446                 rdt_last_cmd_puts("directory was removed\n");
2447                 goto out_unlock;
2448         }
2449
2450         if (rtype == RDTMON_GROUP &&
2451             (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2452              prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
2453                 ret = -EINVAL;
2454                 rdt_last_cmd_puts("pseudo-locking in progress\n");
2455                 goto out_unlock;
2456         }
2457
2458         /* allocate the rdtgroup. */
2459         rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
2460         if (!rdtgrp) {
2461                 ret = -ENOSPC;
2462                 rdt_last_cmd_puts("kernel out of memory\n");
2463                 goto out_unlock;
2464         }
2465         *r = rdtgrp;
2466         rdtgrp->mon.parent = prdtgrp;
2467         rdtgrp->type = rtype;
2468         INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
2469
2470         /* kernfs creates the directory for rdtgrp */
2471         kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
2472         if (IS_ERR(kn)) {
2473                 ret = PTR_ERR(kn);
2474                 rdt_last_cmd_puts("kernfs create error\n");
2475                 goto out_free_rgrp;
2476         }
2477         rdtgrp->kn = kn;
2478
2479         /*
2480          * kernfs_remove() will drop the reference count on "kn" which
2481          * will free it. But we still need it to stick around for the
2482          * rdtgroup_kn_unlock(kn} call below. Take one extra reference
2483          * here, which will be dropped inside rdtgroup_kn_unlock().
2484          */
2485         kernfs_get(kn);
2486
2487         ret = rdtgroup_kn_set_ugid(kn);
2488         if (ret) {
2489                 rdt_last_cmd_puts("kernfs perm error\n");
2490                 goto out_destroy;
2491         }
2492
2493         files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
2494         ret = rdtgroup_add_files(kn, files);
2495         if (ret) {
2496                 rdt_last_cmd_puts("kernfs fill error\n");
2497                 goto out_destroy;
2498         }
2499
2500         if (rdt_mon_capable) {
2501                 ret = alloc_rmid();
2502                 if (ret < 0) {
2503                         rdt_last_cmd_puts("out of RMIDs\n");
2504                         goto out_destroy;
2505                 }
2506                 rdtgrp->mon.rmid = ret;
2507
2508                 ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
2509                 if (ret) {
2510                         rdt_last_cmd_puts("kernfs subdir error\n");
2511                         goto out_idfree;
2512                 }
2513         }
2514         kernfs_activate(kn);
2515
2516         /*
2517          * The caller unlocks the prgrp_kn upon success.
2518          */
2519         return 0;
2520
2521 out_idfree:
2522         free_rmid(rdtgrp->mon.rmid);
2523 out_destroy:
2524         kernfs_remove(rdtgrp->kn);
2525 out_free_rgrp:
2526         kfree(rdtgrp);
2527 out_unlock:
2528         rdtgroup_kn_unlock(prgrp_kn);
2529         return ret;
2530 }
2531
2532 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
2533 {
2534         kernfs_remove(rgrp->kn);
2535         free_rmid(rgrp->mon.rmid);
2536         kfree(rgrp);
2537 }
2538
2539 /*
2540  * Create a monitor group under "mon_groups" directory of a control
2541  * and monitor group(ctrl_mon). This is a resource group
2542  * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
2543  */
2544 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
2545                               struct kernfs_node *prgrp_kn,
2546                               const char *name,
2547                               umode_t mode)
2548 {
2549         struct rdtgroup *rdtgrp, *prgrp;
2550         int ret;
2551
2552         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
2553                                 &rdtgrp);
2554         if (ret)
2555                 return ret;
2556
2557         prgrp = rdtgrp->mon.parent;
2558         rdtgrp->closid = prgrp->closid;
2559
2560         /*
2561          * Add the rdtgrp to the list of rdtgrps the parent
2562          * ctrl_mon group has to track.
2563          */
2564         list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
2565
2566         rdtgroup_kn_unlock(prgrp_kn);
2567         return ret;
2568 }
2569
2570 /*
2571  * These are rdtgroups created under the root directory. Can be used
2572  * to allocate and monitor resources.
2573  */
2574 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
2575                                    struct kernfs_node *prgrp_kn,
2576                                    const char *name, umode_t mode)
2577 {
2578         struct rdtgroup *rdtgrp;
2579         struct kernfs_node *kn;
2580         u32 closid;
2581         int ret;
2582
2583         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
2584                                 &rdtgrp);
2585         if (ret)
2586                 return ret;
2587
2588         kn = rdtgrp->kn;
2589         ret = closid_alloc();
2590         if (ret < 0) {
2591                 rdt_last_cmd_puts("out of CLOSIDs\n");
2592                 goto out_common_fail;
2593         }
2594         closid = ret;
2595         ret = 0;
2596
2597         rdtgrp->closid = closid;
2598         ret = rdtgroup_init_alloc(rdtgrp);
2599         if (ret < 0)
2600                 goto out_id_free;
2601
2602         list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
2603
2604         if (rdt_mon_capable) {
2605                 /*
2606                  * Create an empty mon_groups directory to hold the subset
2607                  * of tasks and cpus to monitor.
2608                  */
2609                 ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
2610                 if (ret) {
2611                         rdt_last_cmd_puts("kernfs subdir error\n");
2612                         goto out_del_list;
2613                 }
2614         }
2615
2616         goto out_unlock;
2617
2618 out_del_list:
2619         list_del(&rdtgrp->rdtgroup_list);
2620 out_id_free:
2621         closid_free(closid);
2622 out_common_fail:
2623         mkdir_rdt_prepare_clean(rdtgrp);
2624 out_unlock:
2625         rdtgroup_kn_unlock(prgrp_kn);
2626         return ret;
2627 }
2628
2629 /*
2630  * We allow creating mon groups only with in a directory called "mon_groups"
2631  * which is present in every ctrl_mon group. Check if this is a valid
2632  * "mon_groups" directory.
2633  *
2634  * 1. The directory should be named "mon_groups".
2635  * 2. The mon group itself should "not" be named "mon_groups".
2636  *   This makes sure "mon_groups" directory always has a ctrl_mon group
2637  *   as parent.
2638  */
2639 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
2640 {
2641         return (!strcmp(kn->name, "mon_groups") &&
2642                 strcmp(name, "mon_groups"));
2643 }
2644
2645 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
2646                           umode_t mode)
2647 {
2648         /* Do not accept '\n' to avoid unparsable situation. */
2649         if (strchr(name, '\n'))
2650                 return -EINVAL;
2651
2652         /*
2653          * If the parent directory is the root directory and RDT
2654          * allocation is supported, add a control and monitoring
2655          * subdirectory
2656          */
2657         if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
2658                 return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
2659
2660         /*
2661          * If RDT monitoring is supported and the parent directory is a valid
2662          * "mon_groups" directory, add a monitoring subdirectory.
2663          */
2664         if (rdt_mon_capable && is_mon_groups(parent_kn, name))
2665                 return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
2666
2667         return -EPERM;
2668 }
2669
2670 static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2671                               cpumask_var_t tmpmask)
2672 {
2673         struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
2674         int cpu;
2675
2676         /* Give any tasks back to the parent group */
2677         rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
2678
2679         /* Update per cpu rmid of the moved CPUs first */
2680         for_each_cpu(cpu, &rdtgrp->cpu_mask)
2681                 per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
2682         /*
2683          * Update the MSR on moved CPUs and CPUs which have moved
2684          * task running on them.
2685          */
2686         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2687         update_closid_rmid(tmpmask, NULL);
2688
2689         rdtgrp->flags = RDT_DELETED;
2690         free_rmid(rdtgrp->mon.rmid);
2691
2692         /*
2693          * Remove the rdtgrp from the parent ctrl_mon group's list
2694          */
2695         WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
2696         list_del(&rdtgrp->mon.crdtgrp_list);
2697
2698         /*
2699          * one extra hold on this, will drop when we kfree(rdtgrp)
2700          * in rdtgroup_kn_unlock()
2701          */
2702         kernfs_get(kn);
2703         kernfs_remove(rdtgrp->kn);
2704
2705         return 0;
2706 }
2707
2708 static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
2709                                 struct rdtgroup *rdtgrp)
2710 {
2711         rdtgrp->flags = RDT_DELETED;
2712         list_del(&rdtgrp->rdtgroup_list);
2713
2714         /*
2715          * one extra hold on this, will drop when we kfree(rdtgrp)
2716          * in rdtgroup_kn_unlock()
2717          */
2718         kernfs_get(kn);
2719         kernfs_remove(rdtgrp->kn);
2720         return 0;
2721 }
2722
2723 static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2724                                cpumask_var_t tmpmask)
2725 {
2726         int cpu;
2727
2728         /* Give any tasks back to the default group */
2729         rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
2730
2731         /* Give any CPUs back to the default group */
2732         cpumask_or(&rdtgroup_default.cpu_mask,
2733                    &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2734
2735         /* Update per cpu closid and rmid of the moved CPUs first */
2736         for_each_cpu(cpu, &rdtgrp->cpu_mask) {
2737                 per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
2738                 per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
2739         }
2740
2741         /*
2742          * Update the MSR on moved CPUs and CPUs which have moved
2743          * task running on them.
2744          */
2745         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2746         update_closid_rmid(tmpmask, NULL);
2747
2748         closid_free(rdtgrp->closid);
2749         free_rmid(rdtgrp->mon.rmid);
2750
2751         /*
2752          * Free all the child monitor group rmids.
2753          */
2754         free_all_child_rdtgrp(rdtgrp);
2755
2756         rdtgroup_ctrl_remove(kn, rdtgrp);
2757
2758         return 0;
2759 }
2760
2761 static int rdtgroup_rmdir(struct kernfs_node *kn)
2762 {
2763         struct kernfs_node *parent_kn = kn->parent;
2764         struct rdtgroup *rdtgrp;
2765         cpumask_var_t tmpmask;
2766         int ret = 0;
2767
2768         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
2769                 return -ENOMEM;
2770
2771         rdtgrp = rdtgroup_kn_lock_live(kn);
2772         if (!rdtgrp) {
2773                 ret = -EPERM;
2774                 goto out;
2775         }
2776
2777         /*
2778          * If the rdtgroup is a ctrl_mon group and parent directory
2779          * is the root directory, remove the ctrl_mon group.
2780          *
2781          * If the rdtgroup is a mon group and parent directory
2782          * is a valid "mon_groups" directory, remove the mon group.
2783          */
2784         if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
2785                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2786                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
2787                         ret = rdtgroup_ctrl_remove(kn, rdtgrp);
2788                 } else {
2789                         ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
2790                 }
2791         } else if (rdtgrp->type == RDTMON_GROUP &&
2792                  is_mon_groups(parent_kn, kn->name)) {
2793                 ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
2794         } else {
2795                 ret = -EPERM;
2796         }
2797
2798 out:
2799         rdtgroup_kn_unlock(kn);
2800         free_cpumask_var(tmpmask);
2801         return ret;
2802 }
2803
2804 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
2805 {
2806         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
2807                 seq_puts(seq, ",cdp");
2808         return 0;
2809 }
2810
2811 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
2812         .mkdir          = rdtgroup_mkdir,
2813         .rmdir          = rdtgroup_rmdir,
2814         .show_options   = rdtgroup_show_options,
2815 };
2816
2817 static int __init rdtgroup_setup_root(void)
2818 {
2819         int ret;
2820
2821         rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
2822                                       KERNFS_ROOT_CREATE_DEACTIVATED |
2823                                       KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
2824                                       &rdtgroup_default);
2825         if (IS_ERR(rdt_root))
2826                 return PTR_ERR(rdt_root);
2827
2828         mutex_lock(&rdtgroup_mutex);
2829
2830         rdtgroup_default.closid = 0;
2831         rdtgroup_default.mon.rmid = 0;
2832         rdtgroup_default.type = RDTCTRL_GROUP;
2833         INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
2834
2835         list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
2836
2837         ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
2838         if (ret) {
2839                 kernfs_destroy_root(rdt_root);
2840                 goto out;
2841         }
2842
2843         rdtgroup_default.kn = rdt_root->kn;
2844         kernfs_activate(rdtgroup_default.kn);
2845
2846 out:
2847         mutex_unlock(&rdtgroup_mutex);
2848
2849         return ret;
2850 }
2851
2852 /*
2853  * rdtgroup_init - rdtgroup initialization
2854  *
2855  * Setup resctrl file system including set up root, create mount point,
2856  * register rdtgroup filesystem, and initialize files under root directory.
2857  *
2858  * Return: 0 on success or -errno
2859  */
2860 int __init rdtgroup_init(void)
2861 {
2862         int ret = 0;
2863
2864         seq_buf_init(&last_cmd_status, last_cmd_status_buf,
2865                      sizeof(last_cmd_status_buf));
2866
2867         ret = rdtgroup_setup_root();
2868         if (ret)
2869                 return ret;
2870
2871         ret = sysfs_create_mount_point(fs_kobj, "resctrl");
2872         if (ret)
2873                 goto cleanup_root;
2874
2875         ret = register_filesystem(&rdt_fs_type);
2876         if (ret)
2877                 goto cleanup_mountpoint;
2878
2879         /*
2880          * Adding the resctrl debugfs directory here may not be ideal since
2881          * it would let the resctrl debugfs directory appear on the debugfs
2882          * filesystem before the resctrl filesystem is mounted.
2883          * It may also be ok since that would enable debugging of RDT before
2884          * resctrl is mounted.
2885          * The reason why the debugfs directory is created here and not in
2886          * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
2887          * during the debugfs directory creation also &sb->s_type->i_mutex_key
2888          * (the lockdep class of inode->i_rwsem). Other filesystem
2889          * interactions (eg. SyS_getdents) have the lock ordering:
2890          * &sb->s_type->i_mutex_key --> &mm->mmap_sem
2891          * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
2892          * is taken, thus creating dependency:
2893          * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
2894          * issues considering the other two lock dependencies.
2895          * By creating the debugfs directory here we avoid a dependency
2896          * that may cause deadlock (even though file operations cannot
2897          * occur until the filesystem is mounted, but I do not know how to
2898          * tell lockdep that).
2899          */
2900         debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
2901
2902         return 0;
2903
2904 cleanup_mountpoint:
2905         sysfs_remove_mount_point(fs_kobj, "resctrl");
2906 cleanup_root:
2907         kernfs_destroy_root(rdt_root);
2908
2909         return ret;
2910 }
2911
2912 void __exit rdtgroup_exit(void)
2913 {
2914         debugfs_remove_recursive(debugfs_resctrl);
2915         unregister_filesystem(&rdt_fs_type);
2916         sysfs_remove_mount_point(fs_kobj, "resctrl");
2917         kernfs_destroy_root(rdt_root);
2918 }