1 // SPDX-License-Identifier: GPL-2.0
3 * Memory bandwidth monitoring and allocation library
5 * Copyright (C) 2018 Intel Corporation
8 * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
/* Name fragment looked for in PMU directory names to find iMC counters */
13 #define UNCORE_IMC "uncore_imc"
/* Event description files of an iMC counter, relative to its PMU directory */
14 #define READ_FILE_NAME "events/cas_count_read"
15 #define WRITE_FILE_NAME "events/cas_count_write"
/* Directory that is scanned for PMU entries */
16 #define DYN_PMU_PATH "/sys/bus/event_source/devices"
/*
 * Multiplier applied to raw iMC counter values when bandwidth is summed up.
 * The value equals 64 / 2^20, so it presumably turns 64-byte transfers into
 * MiB - TODO confirm.
 */
17 #define SCALE 0.00006103515625
/*
 * printf-style templates of resctrl monitoring files. The first "%s" is the
 * resctrl root, "CON" variants additionally take a control group name and
 * "MON" variants a monitor group name; the final "%02d" is the numeric id
 * used in the "mon_L3_<id>" directory name.
 */
22 #define CON_MON_MBM_LOCAL_BYTES_PATH \
23 "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
25 #define CON_MBM_LOCAL_BYTES_PATH \
26 "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
28 #define MON_MBM_LOCAL_BYTES_PATH \
29 "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
31 #define MBM_LOCAL_BYTES_PATH \
32 "%s/mon_data/mon_L3_%02d/mbm_local_bytes"
/* Same four layouts as above, for the llc_occupancy file */
34 #define CON_MON_LCC_OCCUP_PATH \
35 "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
37 #define CON_LCC_OCCUP_PATH \
38 "%s/%s/mon_data/mon_L3_%02d/llc_occupancy"
40 #define MON_LCC_OCCUP_PATH \
41 "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
43 #define LCC_OCCUP_PATH \
44 "%s/mon_data/mon_L3_%02d/llc_occupancy"
/*
 * Buffer layout used when a counter's perf fd is read in get_mem_bw_imc();
 * sizeof(struct membw_read_format) bytes are requested per read().
 */
46 struct membw_read_format {
47 __u64 value; /* The value of the event */
48 __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
49 __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
50 __u64 id; /* if PERF_FORMAT_ID */
/*
 * State of one iMC counter: the perf attributes used to open it and the
 * values most recently read back from it. Other members (type, event, umask,
 * fd) are used elsewhere in this file.
 */
53 struct imc_counter_config {
/* Attributes handed to perf_event_open() */
57 struct perf_event_attr pe;
/* Filled in by read() on the counter's fd */
58 struct membw_read_format return_value;
/* Path of the resctrl mbm_local_bytes file, set by set_mbm_path() and friends */
62 static char mbm_total_path[1024];
/* One row per iMC; the two columns are indexed with READ and WRITE */
64 static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
66 void membw_initialize_perf_event_attr(int i, int j)
68 memset(&imc_counters_config[i][j].pe, 0,
69 sizeof(struct perf_event_attr));
70 imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type;
71 imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr);
72 imc_counters_config[i][j].pe.disabled = 1;
73 imc_counters_config[i][j].pe.inherit = 1;
74 imc_counters_config[i][j].pe.exclude_guest = 0;
75 imc_counters_config[i][j].pe.config =
76 imc_counters_config[i][j].umask << 8 |
77 imc_counters_config[i][j].event;
78 imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER;
79 imc_counters_config[i][j].pe.read_format =
80 PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
83 void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
85 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0);
86 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0);
89 void membw_ioctl_perf_event_ioc_disable(int i, int j)
91 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0);
95 * get_event_and_umask: Parse config into event and umask
96 * @cas_count_cfg: Config
98 * @op: Operation (read/write)
100 void get_event_and_umask(char *cas_count_cfg, int count, bool op)
102 char *token[MAX_TOKENS];
105 strcat(cas_count_cfg, ",");
106 token[0] = strtok(cas_count_cfg, "=,");
108 for (i = 1; i < MAX_TOKENS; i++)
109 token[i] = strtok(NULL, "=,");
111 for (i = 0; i < MAX_TOKENS; i++) {
114 if (strcmp(token[i], "event") == 0) {
116 imc_counters_config[count][READ].event =
117 strtol(token[i + 1], NULL, 16);
119 imc_counters_config[count][WRITE].event =
120 strtol(token[i + 1], NULL, 16);
122 if (strcmp(token[i], "umask") == 0) {
124 imc_counters_config[count][READ].umask =
125 strtol(token[i + 1], NULL, 16);
127 imc_counters_config[count][WRITE].umask =
128 strtol(token[i + 1], NULL, 16);
133 static int open_perf_event(int i, int cpu_no, int j)
135 imc_counters_config[i][j].fd =
136 perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1,
137 PERF_FLAG_FD_CLOEXEC);
139 if (imc_counters_config[i][j].fd == -1) {
140 fprintf(stderr, "Error opening leader %llx\n",
141 imc_counters_config[i][j].pe.config);
149 /* Get type and config (read and write) of an iMC counter */
150 static int read_from_imc_dir(char *imc_dir, int count)
152 char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024];
155 /* Get type of iMC counter */
156 sprintf(imc_counter_type, "%s%s", imc_dir, "type");
157 fp = fopen(imc_counter_type, "r");
159 perror("Failed to open imc counter type file");
163 if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) {
164 perror("Could not get imc type");
171 imc_counters_config[count][WRITE].type =
172 imc_counters_config[count][READ].type;
174 /* Get read config */
175 sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME);
176 fp = fopen(imc_counter_cfg, "r");
178 perror("Failed to open imc config file");
182 if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
183 perror("Could not get imc cas count read");
190 get_event_and_umask(cas_count_cfg, count, READ);
192 /* Get write config */
193 sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME);
194 fp = fopen(imc_counter_cfg, "r");
196 perror("Failed to open imc config file");
200 if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
201 perror("Could not get imc cas count write");
208 get_event_and_umask(cas_count_cfg, count, WRITE);
214 * A system can have 'n' number of iMC (Integrated Memory Controller)
215 * counters, get that 'n'. For each iMC counter get it's type and config.
216 * Also, each counter has two configs, one for read and the other for write.
217 * A config again has two parts, event and umask.
218 * Enumerate all these details into an array of structures.
220 * Return: >= 0 on success. < 0 on failure.
222 static int num_of_imcs(void)
224 char imc_dir[512], *temp;
225 unsigned int count = 0;
230 dp = opendir(DYN_PMU_PATH);
232 while ((ep = readdir(dp))) {
233 temp = strstr(ep->d_name, UNCORE_IMC);
238 * imc counters are named as "uncore_imc_<n>", hence
239 * increment the pointer to point to <n>. Note that
240 * sizeof(UNCORE_IMC) would count for null character as
241 * well and hence the last underscore character in
242 * uncore_imc'_' need not be counted.
244 temp = temp + sizeof(UNCORE_IMC);
247 * Some directories under "DYN_PMU_PATH" could have
248 * names like "uncore_imc_free_running", hence, check if
249 * first character is a numerical digit or not.
251 if (temp[0] >= '0' && temp[0] <= '9') {
252 sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH,
254 ret = read_from_imc_dir(imc_dir, count);
265 perror("Unable find iMC counters!\n");
270 perror("Unable to open PMU directory!\n");
278 static int initialize_mem_bw_imc(void)
282 imcs = num_of_imcs();
286 /* Initialize perf_event_attr structures for all iMC's */
287 for (imc = 0; imc < imcs; imc++) {
288 for (j = 0; j < 2; j++)
289 membw_initialize_perf_event_attr(imc, j);
296 * get_mem_bw_imc: Memory band width as reported by iMC counters
297 * @cpu_no: CPU number that the benchmark PID is binded to
298 * @bw_report: Bandwidth report type (reads, writes)
300 * Memory B/W utilized by a process on a socket can be calculated using
301 * iMC counters. Perf events are used to read these counters.
303 * Return: = 0 on success. < 0 on failure.
305 static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc)
307 float reads, writes, of_mul_read, of_mul_write;
310 /* Start all iMC counters to log values (both read and write) */
311 reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1;
312 for (imc = 0; imc < imcs; imc++) {
313 for (j = 0; j < 2; j++) {
314 ret = open_perf_event(imc, cpu_no, j);
318 for (j = 0; j < 2; j++)
319 membw_ioctl_perf_event_ioc_reset_enable(imc, j);
324 /* Stop counters after a second to get results (both read and write) */
325 for (imc = 0; imc < imcs; imc++) {
326 for (j = 0; j < 2; j++)
327 membw_ioctl_perf_event_ioc_disable(imc, j);
331 * Get results which are stored in struct type imc_counter_config
332 * Take over flow into consideration before calculating total b/w
334 for (imc = 0; imc < imcs; imc++) {
335 struct imc_counter_config *r =
336 &imc_counters_config[imc][READ];
337 struct imc_counter_config *w =
338 &imc_counters_config[imc][WRITE];
340 if (read(r->fd, &r->return_value,
341 sizeof(struct membw_read_format)) == -1) {
342 perror("Couldn't get read b/w through iMC");
347 if (read(w->fd, &w->return_value,
348 sizeof(struct membw_read_format)) == -1) {
349 perror("Couldn't get write bw through iMC");
354 __u64 r_time_enabled = r->return_value.time_enabled;
355 __u64 r_time_running = r->return_value.time_running;
357 if (r_time_enabled != r_time_running)
358 of_mul_read = (float)r_time_enabled /
359 (float)r_time_running;
361 __u64 w_time_enabled = w->return_value.time_enabled;
362 __u64 w_time_running = w->return_value.time_running;
364 if (w_time_enabled != w_time_running)
365 of_mul_write = (float)w_time_enabled /
366 (float)w_time_running;
367 reads += r->return_value.value * of_mul_read * SCALE;
368 writes += w->return_value.value * of_mul_write * SCALE;
371 for (imc = 0; imc < imcs; imc++) {
372 close(imc_counters_config[imc][READ].fd);
373 close(imc_counters_config[imc][WRITE].fd);
376 if (strcmp(bw_report, "reads") == 0) {
381 if (strcmp(bw_report, "writes") == 0) {
386 *bw_imc = reads + writes;
390 void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id)
392 if (ctrlgrp && mongrp)
393 sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH,
394 RESCTRL_PATH, ctrlgrp, mongrp, resource_id);
395 else if (!ctrlgrp && mongrp)
396 sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
397 mongrp, resource_id);
398 else if (ctrlgrp && !mongrp)
399 sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
400 ctrlgrp, resource_id);
401 else if (!ctrlgrp && !mongrp)
402 sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
407 * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path"
408 * @ctrlgrp: Name of the control monitor group (con_mon grp)
409 * @mongrp: Name of the monitor group (mon grp)
410 * @cpu_no: CPU number that the benchmark PID is binded to
411 * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc)
413 static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp,
414 int cpu_no, char *resctrl_val)
/* The resource id of @cpu_no selects the "mon_L3_<id>" directory */
418 if (get_resource_id(cpu_no, &resource_id) < 0) {
419 perror("Could not get resource_id");
/* MBM: every combination of control and monitor group is handled by set_mbm_path() */
423 if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)))
424 set_mbm_path(ctrlgrp, mongrp, resource_id);
/* MBA: @mongrp is not used; the path is either inside @ctrlgrp or at the resctrl root */
426 if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
428 sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH,
429 RESCTRL_PATH, ctrlgrp, resource_id);
431 sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH,
432 RESCTRL_PATH, resource_id);
437 * Get MBM Local bytes as reported by resctrl FS
439 * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp
440 * 2. If only con_mon grp is given, then read from con_mon grp
441 * 3. If both are not given, then read from root con_mon grp
443 * 1. If con_mon grp is given, then read from it
444 * 2. If con_mon grp is not given, then read from root con_mon grp
446 static int get_mem_bw_resctrl(unsigned long *mbm_total)
450 fp = fopen(mbm_total_path, "r");
452 perror("Failed to open total bw file");
456 if (fscanf(fp, "%lu", mbm_total) <= 0) {
457 perror("Could not get mbm local bytes");
/*
 * ctrlc_handler: Handler that signal_handler_register() installs for SIGINT,
 * SIGTERM and SIGHUP. None of its three arguments is used by the statements
 * visible here.
 */
469 void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
/* Take the benchmark process down */
471 kill(bm_pid, SIGKILL);
474 ksft_print_msg("Ending\n\n");
480 * Register CTRL-C handler for parent, as it has to kill
481 * child process before exiting.
483 int signal_handler_register(void)
485 struct sigaction sigact;
488 sigact.sa_sigaction = ctrlc_handler;
489 sigemptyset(&sigact.sa_mask);
490 sigact.sa_flags = SA_SIGINFO;
491 if (sigaction(SIGINT, &sigact, NULL) ||
492 sigaction(SIGTERM, &sigact, NULL) ||
493 sigaction(SIGHUP, &sigact, NULL)) {
494 perror("# sigaction");
501 * Reset signal handler to SIG_DFL.
502 * Non-Value return because the caller should keep
503 * the error code of other path even if sigaction fails.
505 void signal_handler_unregister(void)
507 struct sigaction sigact;
509 sigact.sa_handler = SIG_DFL;
510 sigemptyset(&sigact.sa_mask);
511 if (sigaction(SIGINT, &sigact, NULL) ||
512 sigaction(SIGTERM, &sigact, NULL) ||
513 sigaction(SIGHUP, &sigact, NULL)) {
514 perror("# sigaction");
/*
 * print_results_bw:	the memory bandwidth results are stored in a file
 * @filename:		file that stores the results; "stdio" and "stderr"
 *			both select printing to standard output instead
 * @bm_pid:		child pid that runs benchmark
 * @bw_imc:		perf imc counter value
 * @bw_resc:		memory bandwidth value
 *
 * A results file is opened in append mode. Failure to open it, to format
 * the line or to flush it when the file is closed is reported.
 *
 * Return: 0 on success. non-zero on failure.
 */
static int print_results_bw(char *filename, int bm_pid, float bw_imc,
			    unsigned long bw_resc)
{
	unsigned long diff = fabs(bw_imc - bw_resc);
	FILE *fp;
	int failed;

	if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) {
		printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc);
		printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff);

		return 0;
	}

	fp = fopen(filename, "a");
	if (!fp) {
		perror("Cannot open results file");

		return -1;
	}

	/* Report right away, before another call can overwrite errno */
	failed = fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n",
			 bm_pid, bw_imc, bw_resc, diff) <= 0;
	if (failed)
		perror("Could not log results.");

	/* Buffered output is only written out here, so fclose() can fail too */
	if (fclose(fp) == EOF && !failed) {
		perror("Could not log results.");
		failed = 1;
	}

	return failed ? -1 : 0;
}
556 static void set_cmt_path(const char *ctrlgrp, const char *mongrp, char sock_num)
558 if (strlen(ctrlgrp) && strlen(mongrp))
559 sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH,
560 ctrlgrp, mongrp, sock_num);
561 else if (!strlen(ctrlgrp) && strlen(mongrp))
562 sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH,
564 else if (strlen(ctrlgrp) && !strlen(mongrp))
565 sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH,
567 else if (!strlen(ctrlgrp) && !strlen(mongrp))
568 sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num);
572 * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path"
573 * @ctrlgrp: Name of the control monitor group (con_mon grp)
574 * @mongrp: Name of the monitor group (mon grp)
575 * @cpu_no: CPU number that the benchmark PID is binded to
576 * @resctrl_val: Resctrl feature (Eg: cat, cmt.. etc)
578 static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
579 int cpu_no, char *resctrl_val)
583 if (get_resource_id(cpu_no, &resource_id) < 0) {
584 perror("# Unable to resource_id");
588 if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)))
589 set_cmt_path(ctrlgrp, mongrp, resource_id);
/*
 * measure_vals: Take one bandwidth sample from the iMC counters and from
 * resctrl, and log both through print_results_bw().
 * @param: Supplies cpu_no, bw_report and filename for the measurement
 * @bw_resc_start: In/out. Previous resctrl reading on entry; replaced by
 * the newest reading once the sample has been logged.
 */
593 measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start)
595 unsigned long bw_resc, bw_resc_end;
600 * Measure memory bandwidth from resctrl and from
601 * another source which is perf imc value or could
602 * be something else if perf imc event is not available.
603 * Compare the two values to validate resctrl value.
604 * It takes 1sec to measure the data.
606 ret = get_mem_bw_imc(param->cpu_no, param->bw_report, &bw_imc);
610 ret = get_mem_bw_resctrl(&bw_resc_end);
/* resctrl reports a running total: use the delta since the last sample, scaled by MB */
614 bw_resc = (bw_resc_end - *bw_resc_start) / MB;
615 ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
/* The newest reading becomes the baseline of the next sample */
619 *bw_resc_start = bw_resc_end;
625 * resctrl_val: execute benchmark and measure memory bandwidth on
627 * @benchmark_cmd: benchmark command and its arguments
628 * @param: parameters passed to resctrl_val()
630 * Return: 0 on success. non-zero on failure.
632 int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
634 char *resctrl_val = param->resctrl_val;
635 unsigned long bw_resc_start = 0;
636 struct sigaction sigact;
637 int ret = 0, pipefd[2];
638 char pipe_message = 0;
/* An empty filename selects "stdio" as the results destination */
641 if (strcmp(param->filename, "") == 0)
642 sprintf(param->filename, "stdio");
/* The bandwidth report type only matters for the MBA and MBM features */
644 if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) ||
645 !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) {
646 ret = validate_bw_report_request(param->bw_report);
652 * If benchmark wasn't successfully started by child, then child should
653 * kill parent, so save parent's pid
658 perror("# Unable to create pipe");
664 * Fork to start benchmark, save child's pid so that it can be killed
670 perror("# Unable to fork");
677 * Mask all signals except SIGUSR1, parent uses SIGUSR1 to
680 sigfillset(&sigact.sa_mask);
681 sigdelset(&sigact.sa_mask, SIGUSR1);
/* run_benchmark is installed as the SIGUSR1 handler */
683 sigact.sa_sigaction = run_benchmark;
684 sigact.sa_flags = SA_SIGINFO;
686 /* Register for "SIGUSR1" signal from parent */
687 if (sigaction(SIGUSR1, &sigact, NULL))
688 PARENT_EXIT("Can't register child for signal");
690 /* Tell parent that child is ready */
693 if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
694 sizeof(pipe_message)) {
695 perror("# failed signaling parent process");
701 /* Suspend child until delivery of "SIGUSR1" from parent */
702 sigsuspend(&sigact.sa_mask);
704 PARENT_EXIT("Child is done");
707 ksft_print_msg("Benchmark PID: %d\n", bm_pid);
709 ret = signal_handler_register();
/* The benchmark command is handed over as the sigqueue() payload below */
713 value.sival_ptr = benchmark_cmd;
715 /* Taskset benchmark to specified cpu */
716 ret = taskset_benchmark(bm_pid, param->cpu_no);
720 /* Write benchmark to specified control&monitoring grp in resctrl FS */
721 ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
/* MBM/MBA measure through iMC counters and resctrl, CMT through LLC occupancy */
726 if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) ||
727 !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
728 ret = initialize_mem_bw_imc();
732 initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp,
733 param->cpu_no, resctrl_val);
734 } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)))
735 initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp,
736 param->cpu_no, resctrl_val);
738 /* Parent waits for child to be ready. */
740 while (pipe_message != 1) {
741 if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) <
742 sizeof(pipe_message)) {
743 perror("# failed reading message from child process");
750 /* Signal child to start benchmark */
751 if (sigqueue(bm_pid, SIGUSR1, value) == -1) {
752 perror("# sigqueue SIGUSR1 to child");
757 /* Give benchmark enough time to fully run */
760 /* Test runs until the callback setup() tells the test to stop. */
762 ret = param->setup(param);
763 if (ret == END_OF_TESTS) {
770 if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) ||
771 !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
772 ret = measure_vals(param, &bw_resc_start);
775 } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) {
777 ret = measure_cache_vals(param, bm_pid);
/* Restore default signal handling, then stop the benchmark process */
784 signal_handler_unregister();
786 kill(bm_pid, SIGKILL);