ALSA: mips: Convert to the common vmalloc memalloc
[sfrench/cifs-2.6.git] / arch / powerpc / oprofile / op_model_cell.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Cell Broadband Engine OProfile Support
4  *
5  * (C) Copyright IBM Corporation 2006
6  *
7  * Author: David Erb (djerb@us.ibm.com)
8  * Modifications:
9  *         Carl Love <carll@us.ibm.com>
10  *         Maynard Johnson <maynardj@us.ibm.com>
11  */
12
13 #include <linux/cpufreq.h>
14 #include <linux/delay.h>
15 #include <linux/jiffies.h>
16 #include <linux/kthread.h>
17 #include <linux/oprofile.h>
18 #include <linux/percpu.h>
19 #include <linux/smp.h>
20 #include <linux/spinlock.h>
21 #include <linux/timer.h>
22 #include <asm/cell-pmu.h>
23 #include <asm/cputable.h>
24 #include <asm/firmware.h>
25 #include <asm/io.h>
26 #include <asm/oprofile_impl.h>
27 #include <asm/processor.h>
28 #include <asm/prom.h>
29 #include <asm/ptrace.h>
30 #include <asm/reg.h>
31 #include <asm/rtas.h>
32 #include <asm/cell-regs.h>
33
34 #include "../platforms/cell/interrupt.h"
35 #include "cell/pr_util.h"
36
/* Profiling mode identifiers */
#define PPU_PROFILING            0
#define SPU_PROFILING_CYCLES     1
#define SPU_PROFILING_EVENTS     2

/* SPU event number bounds and the SPU address-trace event description */
#define SPU_EVENT_NUM_START      4100
#define SPU_EVENT_NUM_STOP       4399
#define SPU_PROFILE_EVENT_ADDR          4363  /* spu, address trace, decimal */
#define SPU_PROFILE_EVENT_ADDR_MASK_A   0x146 /* sub unit set to zero */
#define SPU_PROFILE_EVENT_ADDR_MASK_B   0x186 /* sub unit set to zero */

#define NUM_SPUS_PER_NODE    8
#define SPU_CYCLES_EVENT_NUM 2  /* event number for SPU_CYCLES */

#define PPU_CYCLES_EVENT_NUM 1  /* event number for CYCLES */
#define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
				 * PPU_CYCLES event
				 */
#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */

#define NUM_THREADS 2         /* number of physical threads in
			       * physical processor
			       */
#define NUM_DEBUG_BUS_WORDS 4
#define NUM_INPUT_BUS_WORDS 2

#define MAX_SPU_COUNT 0xFFFFFF  /* maximum 24 bit LFSR value */

/*
 * Minimum HW interval timer setting to send value to trace buffer is
 * 10 cycles.  To configure the counter to send a value every N cycles,
 * set the counter to 2^32 - 1 - N.
 *
 * The expansion is parenthesized so the macro behaves as a single value
 * when it appears next to higher-precedence operators.
 */
#define NUM_INTERVAL_CYC  (0xFFFFFFFF - 10)
69
/*
 * spu_cycle_reset holds the number of cycles between samples when doing
 * SPU profiling.  It should ONLY be written at the beginning of
 * cell_reg_setup; everywhere else it is read-only.
 */
static unsigned int spu_cycle_reset;

static unsigned int profiling_mode;

/*
 * Index of the SPU (within a node) currently selected for SPU event
 * profiling; spu_evnt_swap() advances it and wraps at NUM_SPUS_PER_NODE.
 */
static int spu_evnt_phys_spu_indx;
78
/*
 * Per-counter configuration kept for each hardware thread (see the
 * pmc_cntrl[][] table).
 */
struct pmc_cntrl_data {
	unsigned long vcntr;	/* virtual counter index */
	unsigned long evnts;	/* event number to count */
	unsigned long masks;	/* unit mask for the event */
	unsigned long enabled;	/* non-zero when the counter is in use */
};
85
86 /*
87  * ibm,cbe-perftools rtas parameters
88  */
89 struct pm_signal {
90         u16 cpu;                /* Processor to modify */
91         u16 sub_unit;           /* hw subunit this applies to (if applicable)*/
92         short int signal_group; /* Signal Group to Enable/Disable */
93         u8 bus_word;            /* Enable/Disable on this Trace/Trigger/Event
94                                  * Bus Word(s) (bitmask)
95                                  */
96         u8 bit;                 /* Trigger/Event bit (if applicable) */
97 };
98
/*
 * rtas call arguments: the sub-function selector and the passthru
 * setting passed to rtas_ibm_cbe_perftools().
 */
enum {
	/* sub-function codes */
	SUBFUNC_RESET = 1,
	SUBFUNC_ACTIVATE = 2,
	SUBFUNC_DEACTIVATE = 3,

	/* passthru settings */
	PASSTHRU_IGNORE = 0,
	PASSTHRU_ENABLE = 1,
	PASSTHRU_DISABLE = 2,
};
111
112 struct pm_cntrl {
113         u16 enable;
114         u16 stop_at_max;
115         u16 trace_mode;
116         u16 freeze;
117         u16 count_mode;
118         u16 spu_addr_trace;
119         u8  trace_buf_ovflw;
120 };
121
122 static struct {
123         u32 group_control;
124         u32 debug_bus_control;
125         struct pm_cntrl pm_cntrl;
126         u32 pm07_cntrl[NR_PHYS_CTRS];
127 } pm_regs;
128
/*
 * Field extractors for an event's unit mask.  The argument is
 * parenthesized so that expressions such as (a | b) are masked as a
 * whole rather than being split by operator precedence.
 */
#define GET_SUB_UNIT(x) (((x) & 0x0000f000) >> 12)
#define GET_BUS_WORD(x) (((x) & 0x000000f0) >> 4)
#define GET_BUS_TYPE(x) (((x) & 0x00000300) >> 8)
#define GET_POLARITY(x) (((x) & 0x00000002) >> 1)
#define GET_COUNT_CYCLES(x) ((x) & 0x00000001)
#define GET_INPUT_CONTROL(x) (((x) & 0x00000004) >> 2)
135
136 static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
137 static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
138 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
139
140 /*
141  * The CELL profiling code makes rtas calls to setup the debug bus to
142  * route the performance signals.  Additionally, SPU profiling requires
143  * a second rtas call to setup the hardware to capture the SPU PCs.
144  * The EIO error value is returned if the token lookups or the rtas
145  * call fail.  The EIO error number is the best choice of the existing
146  * error numbers.  The probability of rtas related error is very low.  But
147  * by returning EIO and printing additional information to dmesg, the user
148  * will know that OProfile did not start and dmesg will tell them why.
149  * OProfile does not support returning errors on Stop.  Not a huge issue
150  * since failure to reset the debug bus or stop the SPU PC collection is
151  * not a fatal issue.  Chances are if the Stop failed, Start doesn't work
152  * either.
153  */
154
155 /*
156  * Interpretation of hdw_thread:
157  * 0 - even virtual cpus 0, 2, 4,...
158  * 1 - odd virtual cpus 1, 3, 5, ...
159  *
160  * FIXME: this is strictly wrong, we need to clean this up in a number
161  * of places. It works for now. -arnd
162  */
163 static u32 hdw_thread;
164
165 static u32 virt_cntr_inter_mask;
166 static struct timer_list timer_virt_cntr;
167 static struct timer_list timer_spu_event_swap;
168
169 /*
170  * pm_signal needs to be global since it is initialized in
171  * cell_reg_setup at the time when the necessary information
172  * is available.
173  */
174 static struct pm_signal pm_signal[NR_PHYS_CTRS];
175 static int pm_rtas_token;    /* token for debug bus setup call */
176 static int spu_rtas_token;   /* token for SPU cycle profiling */
177
178 static u32 reset_value[NR_PHYS_CTRS];
179 static int num_counters;
180 static int oprofile_running;
181 static DEFINE_SPINLOCK(cntr_lock);
182
183 static u32 ctr_enabled;
184
185 static unsigned char input_bus[NUM_INPUT_BUS_WORDS];
186
187 /*
188  * Firmware interface functions
189  */
190 static int
191 rtas_ibm_cbe_perftools(int subfunc, int passthru,
192                        void *address, unsigned long length)
193 {
194         u64 paddr = __pa(address);
195
196         return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
197                          passthru, paddr >> 32, paddr & 0xffffffff, length);
198 }
199
200 static void pm_rtas_reset_signals(u32 node)
201 {
202         int ret;
203         struct pm_signal pm_signal_local;
204
205         /*
206          * The debug bus is being set to the passthru disable state.
207          * However, the FW still expects at least one legal signal routing
208          * entry or it will return an error on the arguments.   If we don't
209          * supply a valid entry, we must ignore all return values.  Ignoring
210          * all return values means we might miss an error we should be
211          * concerned about.
212          */
213
214         /*  fw expects physical cpu #. */
215         pm_signal_local.cpu = node;
216         pm_signal_local.signal_group = 21;
217         pm_signal_local.bus_word = 1;
218         pm_signal_local.sub_unit = 0;
219         pm_signal_local.bit = 0;
220
221         ret = rtas_ibm_cbe_perftools(SUBFUNC_RESET, PASSTHRU_DISABLE,
222                                      &pm_signal_local,
223                                      sizeof(struct pm_signal));
224
225         if (unlikely(ret))
226                 /*
227                  * Not a fatal error. For Oprofile stop, the oprofile
228                  * functions do not support returning an error for
229                  * failure to stop OProfile.
230                  */
231                 printk(KERN_WARNING "%s: rtas returned: %d\n",
232                        __func__, ret);
233 }
234
235 static int pm_rtas_activate_signals(u32 node, u32 count)
236 {
237         int ret;
238         int i, j;
239         struct pm_signal pm_signal_local[NR_PHYS_CTRS];
240
241         /*
242          * There is no debug setup required for the cycles event.
243          * Note that only events in the same group can be used.
244          * Otherwise, there will be conflicts in correctly routing
245          * the signals on the debug bus.  It is the responsibility
246          * of the OProfile user tool to check the events are in
247          * the same group.
248          */
249         i = 0;
250         for (j = 0; j < count; j++) {
251                 if (pm_signal[j].signal_group != PPU_CYCLES_GRP_NUM) {
252
253                         /* fw expects physical cpu # */
254                         pm_signal_local[i].cpu = node;
255                         pm_signal_local[i].signal_group
256                                 = pm_signal[j].signal_group;
257                         pm_signal_local[i].bus_word = pm_signal[j].bus_word;
258                         pm_signal_local[i].sub_unit = pm_signal[j].sub_unit;
259                         pm_signal_local[i].bit = pm_signal[j].bit;
260                         i++;
261                 }
262         }
263
264         if (i != 0) {
265                 ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
266                                              pm_signal_local,
267                                              i * sizeof(struct pm_signal));
268
269                 if (unlikely(ret)) {
270                         printk(KERN_WARNING "%s: rtas returned: %d\n",
271                                __func__, ret);
272                         return -EIO;
273                 }
274         }
275
276         return 0;
277 }
278
279 /*
280  * PM Signal functions
281  */
282 static void set_pm_event(u32 ctr, int event, u32 unit_mask)
283 {
284         struct pm_signal *p;
285         u32 signal_bit;
286         u32 bus_word, bus_type, count_cycles, polarity, input_control;
287         int j, i;
288
289         if (event == PPU_CYCLES_EVENT_NUM) {
290                 /* Special Event: Count all cpu cycles */
291                 pm_regs.pm07_cntrl[ctr] = CBE_COUNT_ALL_CYCLES;
292                 p = &(pm_signal[ctr]);
293                 p->signal_group = PPU_CYCLES_GRP_NUM;
294                 p->bus_word = 1;
295                 p->sub_unit = 0;
296                 p->bit = 0;
297                 goto out;
298         } else {
299                 pm_regs.pm07_cntrl[ctr] = 0;
300         }
301
302         bus_word = GET_BUS_WORD(unit_mask);
303         bus_type = GET_BUS_TYPE(unit_mask);
304         count_cycles = GET_COUNT_CYCLES(unit_mask);
305         polarity = GET_POLARITY(unit_mask);
306         input_control = GET_INPUT_CONTROL(unit_mask);
307         signal_bit = (event % 100);
308
309         p = &(pm_signal[ctr]);
310
311         p->signal_group = event / 100;
312         p->bus_word = bus_word;
313         p->sub_unit = GET_SUB_UNIT(unit_mask);
314
315         pm_regs.pm07_cntrl[ctr] = 0;
316         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_COUNT_CYCLES(count_cycles);
317         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
318         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
319
320         /*
321          * Some of the islands signal selection is based on 64 bit words.
322          * The debug bus words are 32 bits, the input words to the performance
323          * counters are defined as 32 bits.  Need to convert the 64 bit island
324          * specification to the appropriate 32 input bit and bus word for the
325          * performance counter event selection.  See the CELL Performance
326          * monitoring signals manual and the Perf cntr hardware descriptions
327          * for the details.
328          */
329         if (input_control == 0) {
330                 if (signal_bit > 31) {
331                         signal_bit -= 32;
332                         if (bus_word == 0x3)
333                                 bus_word = 0x2;
334                         else if (bus_word == 0xc)
335                                 bus_word = 0x8;
336                 }
337
338                 if ((bus_type == 0) && p->signal_group >= 60)
339                         bus_type = 2;
340                 if ((bus_type == 1) && p->signal_group >= 50)
341                         bus_type = 0;
342
343                 pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_MUX(signal_bit);
344         } else {
345                 pm_regs.pm07_cntrl[ctr] = 0;
346                 p->bit = signal_bit;
347         }
348
349         for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) {
350                 if (bus_word & (1 << i)) {
351                         pm_regs.debug_bus_control |=
352                                 (bus_type << (30 - (2 * i)));
353
354                         for (j = 0; j < NUM_INPUT_BUS_WORDS; j++) {
355                                 if (input_bus[j] == 0xff) {
356                                         input_bus[j] = i;
357                                         pm_regs.group_control |=
358                                                 (i << (30 - (2 * j)));
359
360                                         break;
361                                 }
362                         }
363                 }
364         }
365 out:
366         ;
367 }
368
369 static void write_pm_cntrl(int cpu)
370 {
371         /*
372          * Oprofile will use 32 bit counters, set bits 7:10 to 0
373          * pmregs.pm_cntrl is a global
374          */
375
376         u32 val = 0;
377         if (pm_regs.pm_cntrl.enable == 1)
378                 val |= CBE_PM_ENABLE_PERF_MON;
379
380         if (pm_regs.pm_cntrl.stop_at_max == 1)
381                 val |= CBE_PM_STOP_AT_MAX;
382
383         if (pm_regs.pm_cntrl.trace_mode != 0)
384                 val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode);
385
386         if (pm_regs.pm_cntrl.trace_buf_ovflw == 1)
387                 val |= CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw);
388         if (pm_regs.pm_cntrl.freeze == 1)
389                 val |= CBE_PM_FREEZE_ALL_CTRS;
390
391         val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace);
392
393         /*
394          * Routine set_count_mode must be called previously to set
395          * the count mode based on the user selection of user and kernel.
396          */
397         val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
398         cbe_write_pm(cpu, pm_control, val);
399 }
400
401 static inline void
402 set_count_mode(u32 kernel, u32 user)
403 {
404         /*
405          * The user must specify user and kernel if they want them. If
406          *  neither is specified, OProfile will count in hypervisor mode.
407          *  pm_regs.pm_cntrl is a global
408          */
409         if (kernel) {
410                 if (user)
411                         pm_regs.pm_cntrl.count_mode = CBE_COUNT_ALL_MODES;
412                 else
413                         pm_regs.pm_cntrl.count_mode =
414                                 CBE_COUNT_SUPERVISOR_MODE;
415         } else {
416                 if (user)
417                         pm_regs.pm_cntrl.count_mode = CBE_COUNT_PROBLEM_MODE;
418                 else
419                         pm_regs.pm_cntrl.count_mode =
420                                 CBE_COUNT_HYPERVISOR_MODE;
421         }
422 }
423
424 static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
425 {
426
427         pm07_cntrl[ctr] |= CBE_PM_CTR_ENABLE;
428         cbe_write_pm07_control(cpu, ctr, pm07_cntrl[ctr]);
429 }
430
431 /*
432  * Oprofile is expected to collect data on all CPUs simultaneously.
433  * However, there is one set of performance counters per node.  There are
434  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
435  * multiplex in time the performance counter collection on the two virtual
436  * CPUs.  The multiplexing of the performance counters is done by this
437  * virtual counter routine.
438  *
439  * The pmc_values used below is defined as 'per-cpu' but its use is
440  * more akin to 'per-node'.  We need to store two sets of counter
441  * values per node -- one for the previous run and one for the next.
442  * The per-cpu[NR_PHYS_CTRS] gives us the storage we need.  Each odd/even
443  * pair of per-cpu arrays is used for storing the previous and next
444  * pmc values for a given node.
445  * NOTE: We use the per-cpu variable to improve cache performance.
446  *
447  * This routine will alternate loading the virtual counters for
448  * virtual CPUs
449  */
450 static void cell_virtual_cntr(struct timer_list *unused)
451 {
452         int i, prev_hdw_thread, next_hdw_thread;
453         u32 cpu;
454         unsigned long flags;
455
456         /*
457          * Make sure that the interrupt_hander and the virt counter are
458          * not both playing with the counters on the same node.
459          */
460
461         spin_lock_irqsave(&cntr_lock, flags);
462
463         prev_hdw_thread = hdw_thread;
464
465         /* switch the cpu handling the interrupts */
466         hdw_thread = 1 ^ hdw_thread;
467         next_hdw_thread = hdw_thread;
468
469         pm_regs.group_control = 0;
470         pm_regs.debug_bus_control = 0;
471
472         for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
473                 input_bus[i] = 0xff;
474
475         /*
476          * There are some per thread events.  Must do the
477          * set event, for the thread that is being started
478          */
479         for (i = 0; i < num_counters; i++)
480                 set_pm_event(i,
481                         pmc_cntrl[next_hdw_thread][i].evnts,
482                         pmc_cntrl[next_hdw_thread][i].masks);
483
484         /*
485          * The following is done only once per each node, but
486          * we need cpu #, not node #, to pass to the cbe_xxx functions.
487          */
488         for_each_online_cpu(cpu) {
489                 if (cbe_get_hw_thread_id(cpu))
490                         continue;
491
492                 /*
493                  * stop counters, save counter values, restore counts
494                  * for previous thread
495                  */
496                 cbe_disable_pm(cpu);
497                 cbe_disable_pm_interrupts(cpu);
498                 for (i = 0; i < num_counters; i++) {
499                         per_cpu(pmc_values, cpu + prev_hdw_thread)[i]
500                                 = cbe_read_ctr(cpu, i);
501
502                         if (per_cpu(pmc_values, cpu + next_hdw_thread)[i]
503                             == 0xFFFFFFFF)
504                                 /* If the cntr value is 0xffffffff, we must
505                                  * reset that to 0xfffffff0 when the current
506                                  * thread is restarted.  This will generate a
507                                  * new interrupt and make sure that we never
508                                  * restore the counters to the max value.  If
509                                  * the counters were restored to the max value,
510                                  * they do not increment and no interrupts are
511                                  * generated.  Hence no more samples will be
512                                  * collected on that cpu.
513                                  */
514                                 cbe_write_ctr(cpu, i, 0xFFFFFFF0);
515                         else
516                                 cbe_write_ctr(cpu, i,
517                                               per_cpu(pmc_values,
518                                                       cpu +
519                                                       next_hdw_thread)[i]);
520                 }
521
522                 /*
523                  * Switch to the other thread. Change the interrupt
524                  * and control regs to be scheduled on the CPU
525                  * corresponding to the thread to execute.
526                  */
527                 for (i = 0; i < num_counters; i++) {
528                         if (pmc_cntrl[next_hdw_thread][i].enabled) {
529                                 /*
530                                  * There are some per thread events.
531                                  * Must do the set event, enable_cntr
532                                  * for each cpu.
533                                  */
534                                 enable_ctr(cpu, i,
535                                            pm_regs.pm07_cntrl);
536                         } else {
537                                 cbe_write_pm07_control(cpu, i, 0);
538                         }
539                 }
540
541                 /* Enable interrupts on the CPU thread that is starting */
542                 cbe_enable_pm_interrupts(cpu, next_hdw_thread,
543                                          virt_cntr_inter_mask);
544                 cbe_enable_pm(cpu);
545         }
546
547         spin_unlock_irqrestore(&cntr_lock, flags);
548
549         mod_timer(&timer_virt_cntr, jiffies + HZ / 10);
550 }
551
552 static void start_virt_cntrs(void)
553 {
554         timer_setup(&timer_virt_cntr, cell_virtual_cntr, 0);
555         timer_virt_cntr.expires = jiffies + HZ / 10;
556         add_timer(&timer_virt_cntr);
557 }
558
559 static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr,
560                         struct op_system_config *sys, int num_ctrs)
561 {
562         spu_cycle_reset = ctr[0].count;
563
564         /*
565          * Each node will need to make the rtas call to start
566          * and stop SPU profiling.  Get the token once and store it.
567          */
568         spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
569
570         if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
571                 printk(KERN_ERR
572                        "%s: rtas token ibm,cbe-spu-perftools unknown\n",
573                        __func__);
574                 return -EIO;
575         }
576         return 0;
577 }
578
579 /* Unfortunately, the hardware will only support event profiling
580  * on one SPU per node at a time.  Therefore, we must time slice
581  * the profiling across all SPUs in the node.  Note, we do this
582  * in parallel for each node.  The following routine is called
583  * periodically based on kernel timer to switch which SPU is
584  * being monitored in a round robin fashion.
585  */
586 static void spu_evnt_swap(struct timer_list *unused)
587 {
588         int node;
589         int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx;
590         unsigned long flags;
591         int cpu;
592         int ret;
593         u32 interrupt_mask;
594
595
596         /* enable interrupts on cntr 0 */
597         interrupt_mask = CBE_PM_CTR_OVERFLOW_INTR(0);
598
599         hdw_thread = 0;
600
601         /* Make sure spu event interrupt handler and spu event swap
602          * don't access the counters simultaneously.
603          */
604         spin_lock_irqsave(&cntr_lock, flags);
605
606         cur_spu_evnt_phys_spu_indx = spu_evnt_phys_spu_indx;
607
608         if (++(spu_evnt_phys_spu_indx) == NUM_SPUS_PER_NODE)
609                 spu_evnt_phys_spu_indx = 0;
610
611         pm_signal[0].sub_unit = spu_evnt_phys_spu_indx;
612         pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
613         pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
614
615         /* switch the SPU being profiled on each node */
616         for_each_online_cpu(cpu) {
617                 if (cbe_get_hw_thread_id(cpu))
618                         continue;
619
620                 node = cbe_cpu_to_node(cpu);
621                 cur_phys_spu = (node * NUM_SPUS_PER_NODE)
622                         + cur_spu_evnt_phys_spu_indx;
623                 nxt_phys_spu = (node * NUM_SPUS_PER_NODE)
624                         + spu_evnt_phys_spu_indx;
625
626                 /*
627                  * stop counters, save counter values, restore counts
628                  * for previous physical SPU
629                  */
630                 cbe_disable_pm(cpu);
631                 cbe_disable_pm_interrupts(cpu);
632
633                 spu_pm_cnt[cur_phys_spu]
634                         = cbe_read_ctr(cpu, 0);
635
636                 /* restore previous count for the next spu to sample */
637                 /* NOTE, hardware issue, counter will not start if the
638                  * counter value is at max (0xFFFFFFFF).
639                  */
640                 if (spu_pm_cnt[nxt_phys_spu] >= 0xFFFFFFFF)
641                         cbe_write_ctr(cpu, 0, 0xFFFFFFF0);
642                  else
643                          cbe_write_ctr(cpu, 0, spu_pm_cnt[nxt_phys_spu]);
644
645                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
646
647                 /* setup the debug bus measure the one event and
648                  * the two events to route the next SPU's PC on
649                  * the debug bus
650                  */
651                 ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), 3);
652                 if (ret)
653                         printk(KERN_ERR "%s: pm_rtas_activate_signals failed, "
654                                "SPU event swap\n", __func__);
655
656                 /* clear the trace buffer, don't want to take PC for
657                  * previous SPU*/
658                 cbe_write_pm(cpu, trace_address, 0);
659
660                 enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
661
662                 /* Enable interrupts on the CPU thread that is starting */
663                 cbe_enable_pm_interrupts(cpu, hdw_thread,
664                                          interrupt_mask);
665                 cbe_enable_pm(cpu);
666         }
667
668         spin_unlock_irqrestore(&cntr_lock, flags);
669
670         /* swap approximately every 0.1 seconds */
671         mod_timer(&timer_spu_event_swap, jiffies + HZ / 25);
672 }
673
674 static void start_spu_event_swap(void)
675 {
676         timer_setup(&timer_spu_event_swap, spu_evnt_swap, 0);
677         timer_spu_event_swap.expires = jiffies + HZ / 25;
678         add_timer(&timer_spu_event_swap);
679 }
680
681 static int cell_reg_setup_spu_events(struct op_counter_config *ctr,
682                         struct op_system_config *sys, int num_ctrs)
683 {
684         int i;
685
686         /* routine is called once for all nodes */
687
688         spu_evnt_phys_spu_indx = 0;
689         /*
690          * For all events except PPU CYCLEs, each node will need to make
691          * the rtas cbe-perftools call to setup and reset the debug bus.
692          * Make the token lookup call once and store it in the global
693          * variable pm_rtas_token.
694          */
695         pm_rtas_token = rtas_token("ibm,cbe-perftools");
696
697         if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
698                 printk(KERN_ERR
699                        "%s: rtas token ibm,cbe-perftools unknown\n",
700                        __func__);
701                 return -EIO;
702         }
703
704         /* setup the pm_control register settings,
705          * settings will be written per node by the
706          * cell_cpu_setup() function.
707          */
708         pm_regs.pm_cntrl.trace_buf_ovflw = 1;
709
710         /* Use the occurrence trace mode to have SPU PC saved
711          * to the trace buffer.  Occurrence data in trace buffer
712          * is not used.  Bit 2 must be set to store SPU addresses.
713          */
714         pm_regs.pm_cntrl.trace_mode = 2;
715
716         pm_regs.pm_cntrl.spu_addr_trace = 0x1;  /* using debug bus
717                                                    event 2 & 3 */
718
719         /* setup the debug bus event array with the SPU PC routing events.
720         *  Note, pm_signal[0] will be filled in by set_pm_event() call below.
721         */
722         pm_signal[1].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
723         pm_signal[1].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_A);
724         pm_signal[1].bit = SPU_PROFILE_EVENT_ADDR % 100;
725         pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
726
727         pm_signal[2].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
728         pm_signal[2].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_B);
729         pm_signal[2].bit = SPU_PROFILE_EVENT_ADDR % 100;
730         pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
731
732         /* Set the user selected spu event to profile on,
733          * note, only one SPU profiling event is supported
734          */
735         num_counters = 1;  /* Only support one SPU event at a time */
736         set_pm_event(0, ctr[0].event, ctr[0].unit_mask);
737
738         reset_value[0] = 0xFFFFFFFF - ctr[0].count;
739
740         /* global, used by cell_cpu_setup */
741         ctr_enabled |= 1;
742
743         /* Initialize the count for each SPU to the reset value */
744         for (i=0; i < MAX_NUMNODES * NUM_SPUS_PER_NODE; i++)
745                 spu_pm_cnt[i] = reset_value[0];
746
747         return 0;
748 }
749
750 static int cell_reg_setup_ppu(struct op_counter_config *ctr,
751                         struct op_system_config *sys, int num_ctrs)
752 {
753         /* routine is called once for all nodes */
754         int i, j, cpu;
755
756         num_counters = num_ctrs;
757
758         if (unlikely(num_ctrs > NR_PHYS_CTRS)) {
759                 printk(KERN_ERR
760                        "%s: Oprofile, number of specified events " \
761                        "exceeds number of physical counters\n",
762                        __func__);
763                 return -EIO;
764         }
765
766         set_count_mode(sys->enable_kernel, sys->enable_user);
767
768         /* Setup the thread 0 events */
769         for (i = 0; i < num_ctrs; ++i) {
770
771                 pmc_cntrl[0][i].evnts = ctr[i].event;
772                 pmc_cntrl[0][i].masks = ctr[i].unit_mask;
773                 pmc_cntrl[0][i].enabled = ctr[i].enabled;
774                 pmc_cntrl[0][i].vcntr = i;
775
776                 for_each_possible_cpu(j)
777                         per_cpu(pmc_values, j)[i] = 0;
778         }
779
780         /*
781          * Setup the thread 1 events, map the thread 0 event to the
782          * equivalent thread 1 event.
783          */
784         for (i = 0; i < num_ctrs; ++i) {
785                 if ((ctr[i].event >= 2100) && (ctr[i].event <= 2111))
786                         pmc_cntrl[1][i].evnts = ctr[i].event + 19;
787                 else if (ctr[i].event == 2203)
788                         pmc_cntrl[1][i].evnts = ctr[i].event;
789                 else if ((ctr[i].event >= 2200) && (ctr[i].event <= 2215))
790                         pmc_cntrl[1][i].evnts = ctr[i].event + 16;
791                 else
792                         pmc_cntrl[1][i].evnts = ctr[i].event;
793
794                 pmc_cntrl[1][i].masks = ctr[i].unit_mask;
795                 pmc_cntrl[1][i].enabled = ctr[i].enabled;
796                 pmc_cntrl[1][i].vcntr = i;
797         }
798
799         for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
800                 input_bus[i] = 0xff;
801
802         /*
803          * Our counters count up, and "count" refers to
804          * how much before the next interrupt, and we interrupt
805          * on overflow.  So we calculate the starting value
806          * which will give us "count" until overflow.
807          * Then we set the events on the enabled counters.
808          */
809         for (i = 0; i < num_counters; ++i) {
810                 /* start with virtual counter set 0 */
811                 if (pmc_cntrl[0][i].enabled) {
812                         /* Using 32bit counters, reset max - count */
813                         reset_value[i] = 0xFFFFFFFF - ctr[i].count;
814                         set_pm_event(i,
815                                      pmc_cntrl[0][i].evnts,
816                                      pmc_cntrl[0][i].masks);
817
818                         /* global, used by cell_cpu_setup */
819                         ctr_enabled |= (1 << i);
820                 }
821         }
822
823         /* initialize the previous counts for the virtual cntrs */
824         for_each_online_cpu(cpu)
825                 for (i = 0; i < num_counters; ++i) {
826                         per_cpu(pmc_values, cpu)[i] = reset_value[i];
827                 }
828
829         return 0;
830 }
831
832
833 /* This function is called once for all cpus combined */
834 static int cell_reg_setup(struct op_counter_config *ctr,
835                         struct op_system_config *sys, int num_ctrs)
836 {
837         int ret=0;
838         spu_cycle_reset = 0;
839
840         /* initialize the spu_arr_trace value, will be reset if
841          * doing spu event profiling.
842          */
843         pm_regs.group_control = 0;
844         pm_regs.debug_bus_control = 0;
845         pm_regs.pm_cntrl.stop_at_max = 1;
846         pm_regs.pm_cntrl.trace_mode = 0;
847         pm_regs.pm_cntrl.freeze = 1;
848         pm_regs.pm_cntrl.trace_buf_ovflw = 0;
849         pm_regs.pm_cntrl.spu_addr_trace = 0;
850
851         /*
852          * For all events except PPU CYCLEs, each node will need to make
853          * the rtas cbe-perftools call to setup and reset the debug bus.
854          * Make the token lookup call once and store it in the global
855          * variable pm_rtas_token.
856          */
857         pm_rtas_token = rtas_token("ibm,cbe-perftools");
858
859         if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
860                 printk(KERN_ERR
861                        "%s: rtas token ibm,cbe-perftools unknown\n",
862                        __func__);
863                 return -EIO;
864         }
865
866         if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
867                 profiling_mode = SPU_PROFILING_CYCLES;
868                 ret = cell_reg_setup_spu_cycles(ctr, sys, num_ctrs);
869         } else if ((ctr[0].event >= SPU_EVENT_NUM_START) &&
870                    (ctr[0].event <= SPU_EVENT_NUM_STOP)) {
871                 profiling_mode = SPU_PROFILING_EVENTS;
872                 spu_cycle_reset = ctr[0].count;
873
874                 /* for SPU event profiling, need to setup the
875                  * pm_signal array with the events to route the
876                  * SPU PC before making the FW call.  Note, only
877                  * one SPU event for profiling can be specified
878                  * at a time.
879                  */
880                 cell_reg_setup_spu_events(ctr, sys, num_ctrs);
881         } else {
882                 profiling_mode = PPU_PROFILING;
883                 ret = cell_reg_setup_ppu(ctr, sys, num_ctrs);
884         }
885
886         return ret;
887 }
888
889
890
891 /* This function is called once for each cpu */
892 static int cell_cpu_setup(struct op_counter_config *cntr)
893 {
894         u32 cpu = smp_processor_id();
895         u32 num_enabled = 0;
896         int i;
897         int ret;
898
899         /* Cycle based SPU profiling does not use the performance
900          * counters.  The trace array is configured to collect
901          * the data.
902          */
903         if (profiling_mode == SPU_PROFILING_CYCLES)
904                 return 0;
905
906         /* There is one performance monitor per processor chip (i.e. node),
907          * so we only need to perform this function once per node.
908          */
909         if (cbe_get_hw_thread_id(cpu))
910                 return 0;
911
912         /* Stop all counters */
913         cbe_disable_pm(cpu);
914         cbe_disable_pm_interrupts(cpu);
915
916         cbe_write_pm(cpu, pm_start_stop, 0);
917         cbe_write_pm(cpu, group_control, pm_regs.group_control);
918         cbe_write_pm(cpu, debug_bus_control, pm_regs.debug_bus_control);
919         write_pm_cntrl(cpu);
920
921         for (i = 0; i < num_counters; ++i) {
922                 if (ctr_enabled & (1 << i)) {
923                         pm_signal[num_enabled].cpu = cbe_cpu_to_node(cpu);
924                         num_enabled++;
925                 }
926         }
927
928         /*
929          * The pm_rtas_activate_signals will return -EIO if the FW
930          * call failed.
931          */
932         if (profiling_mode == SPU_PROFILING_EVENTS) {
933                 /* For SPU event profiling also need to setup the
934                  * pm interval timer
935                  */
936                 ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
937                                                num_enabled+2);
938                 /* store PC from debug bus to Trace buffer as often
939                  * as possible (every 10 cycles)
940                  */
941                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
942                 return ret;
943         } else
944                 return pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
945                                                 num_enabled);
946 }
947
#define ENTRIES  303
#define MAXLFSR  0xFFFFFF

/*
 * Precomputed table of 24 bit LFSR values, indexed by calculate_lfsr().
 * The table is only ever read, so it is declared const.  It must hold
 * exactly ENTRIES values.
 */
static const int initial_lfsr[] = {
	8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
	15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
	4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
	3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
	9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
	2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
	3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
	14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
	11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
	6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
	15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
	7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
	16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
	15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
	15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
	10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
	3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
	3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
	8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
	8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
	4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
	16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
	2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
	14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
	1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
	6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
	10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
	10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
	14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
	7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
	9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
	14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
	13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
	5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
	3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
	6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
	7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
	6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
};
992
/*
 * The hardware uses an LFSR counting sequence to determine when to capture
 * the SPU PCs.  An LFSR sequence is like a pseudo random number sequence
 * where each number occurs once in the sequence but the sequence is not in
 * numerical order. The SPU PC capture is done when the LFSR sequence reaches
 * the last value in the sequence.  Hence the user specified value N
 * corresponds to the LFSR number that is N from the end of the sequence.
 *
 * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
 * LFSR sequence is broken into four ranges.  The spacing of the precomputed
 * values is adjusted in each range so the error between the user specified
 * number (N) of events between samples and the actual number of events based
 * on the precomputed value will be less than about 6.2%.  Note, if the user
 * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
 * This is to prevent the loss of samples because the trace buffer is full.
 *
 *         User specified N                  Step between          Index in
 *                                        precomputed values      precomputed
 *                                                                   table
 * 0               to  2^16-1                   ----                  0
 * 2^16            to  2^16+2^19-1              2^12               1 to 128
 * 2^16+2^19       to  2^16+2^19+2^22-1         2^15             129 to 256
 * 2^16+2^19+2^22  to  2^24-1                   2^18             257 to 302
 *
 *
 * For example, the LFSR values in the second range are computed for 2^16,
 * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indices
 * 1, 2,..., 127, 128.
 *
 * The 24 bit LFSR value for the nth number in the sequence can be
 * calculated using the following code:
 *
 * #define size 24
 * int calculate_lfsr(int n)
 * {
 *      int i;
 *      unsigned int newlfsr0;
 *      unsigned int lfsr = 0xFFFFFF;
 *      unsigned int howmany = n;
 *
 *      for (i = 2; i < howmany + 2; i++) {
 *              newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
 *              ((lfsr >> (size - 1 - 1)) & 1) ^
 *              (((lfsr >> (size - 1 - 6)) & 1) ^
 *              ((lfsr >> (size - 1 - 23)) & 1)));
 *
 *              lfsr >>= 1;
 *              lfsr = lfsr | (newlfsr0 << (size - 1));
 *      }
 *      return lfsr;
 * }
 */
1045
1046 #define V2_16  (0x1 << 16)
1047 #define V2_19  (0x1 << 19)
1048 #define V2_22  (0x1 << 22)
1049
1050 static int calculate_lfsr(int n)
1051 {
1052         /*
1053          * The ranges and steps are in powers of 2 so the calculations
1054          * can be done using shifts rather then divide.
1055          */
1056         int index;
1057
1058         if ((n >> 16) == 0)
1059                 index = 0;
1060         else if (((n - V2_16) >> 19) == 0)
1061                 index = ((n - V2_16) >> 12) + 1;
1062         else if (((n - V2_16 - V2_19) >> 22) == 0)
1063                 index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
1064         else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
1065                 index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
1066         else
1067                 index = ENTRIES-1;
1068
1069         /* make sure index is valid */
1070         if ((index >= ENTRIES) || (index < 0))
1071                 index = ENTRIES-1;
1072
1073         return initial_lfsr[index];
1074 }
1075
1076 static int pm_rtas_activate_spu_profiling(u32 node)
1077 {
1078         int ret, i;
1079         struct pm_signal pm_signal_local[NUM_SPUS_PER_NODE];
1080
1081         /*
1082          * Set up the rtas call to configure the debug bus to
1083          * route the SPU PCs.  Setup the pm_signal for each SPU
1084          */
1085         for (i = 0; i < ARRAY_SIZE(pm_signal_local); i++) {
1086                 pm_signal_local[i].cpu = node;
1087                 pm_signal_local[i].signal_group = 41;
1088                 /* spu i on word (i/2) */
1089                 pm_signal_local[i].bus_word = 1 << i / 2;
1090                 /* spu i */
1091                 pm_signal_local[i].sub_unit = i;
1092                 pm_signal_local[i].bit = 63;
1093         }
1094
1095         ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
1096                                      PASSTHRU_ENABLE, pm_signal_local,
1097                                      (ARRAY_SIZE(pm_signal_local)
1098                                       * sizeof(struct pm_signal)));
1099
1100         if (unlikely(ret)) {
1101                 printk(KERN_WARNING "%s: rtas returned: %d\n",
1102                        __func__, ret);
1103                 return -EIO;
1104         }
1105
1106         return 0;
1107 }
1108
#ifdef CONFIG_CPU_FREQ
/*
 * oprof_cpufreq_notify - cpufreq transition callback.
 *
 * Passes the new frequency to set_spu_profiling_frequency() before the
 * change when the frequency is going up, and after the change when it
 * is going down.  Always returns 0.
 */
static int
oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
{
	struct cpufreq_freqs *freqs = data;
	int rising_pre = (val == CPUFREQ_PRECHANGE) &&
			 (freqs->old < freqs->new);
	int falling_post = (val == CPUFREQ_POSTCHANGE) &&
			   (freqs->old > freqs->new);

	if (rising_pre || falling_post)
		set_spu_profiling_frequency(freqs->new, spu_cycle_reset);

	return 0;
}

static struct notifier_block cpu_freq_notifier_block = {
	.notifier_call	= oprof_cpufreq_notify
};
#endif
1125
1126 /*
1127  * Note the generic OProfile stop calls do not support returning
1128  * an error on stop.  Hence, will not return an error if the FW
1129  * calls fail on stop.  Failure to reset the debug bus is not an issue.
1130  * Failure to disable the SPU profiling is not an issue.  The FW calls
1131  * to enable the performance counters and debug bus will work even if
1132  * the hardware was not cleanly reset.
1133  */
1134 static void cell_global_stop_spu_cycles(void)
1135 {
1136         int subfunc, rtn_value;
1137         unsigned int lfsr_value;
1138         int cpu;
1139
1140         oprofile_running = 0;
1141         smp_wmb();
1142
1143 #ifdef CONFIG_CPU_FREQ
1144         cpufreq_unregister_notifier(&cpu_freq_notifier_block,
1145                                     CPUFREQ_TRANSITION_NOTIFIER);
1146 #endif
1147
1148         for_each_online_cpu(cpu) {
1149                 if (cbe_get_hw_thread_id(cpu))
1150                         continue;
1151
1152                 subfunc = 3;    /*
1153                                  * 2 - activate SPU tracing,
1154                                  * 3 - deactivate
1155                                  */
1156                 lfsr_value = 0x8f100000;
1157
1158                 rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
1159                                       subfunc, cbe_cpu_to_node(cpu),
1160                                       lfsr_value);
1161
1162                 if (unlikely(rtn_value != 0)) {
1163                         printk(KERN_ERR
1164                                "%s: rtas call ibm,cbe-spu-perftools " \
1165                                "failed, return = %d\n",
1166                                __func__, rtn_value);
1167                 }
1168
1169                 /* Deactivate the signals */
1170                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1171         }
1172
1173         stop_spu_profiling_cycles();
1174 }
1175
1176 static void cell_global_stop_spu_events(void)
1177 {
1178         int cpu;
1179         oprofile_running = 0;
1180
1181         stop_spu_profiling_events();
1182         smp_wmb();
1183
1184         for_each_online_cpu(cpu) {
1185                 if (cbe_get_hw_thread_id(cpu))
1186                         continue;
1187
1188                 cbe_sync_irq(cbe_cpu_to_node(cpu));
1189                 /* Stop the counters */
1190                 cbe_disable_pm(cpu);
1191                 cbe_write_pm07_control(cpu, 0, 0);
1192
1193                 /* Deactivate the signals */
1194                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1195
1196                 /* Deactivate interrupts */
1197                 cbe_disable_pm_interrupts(cpu);
1198         }
1199         del_timer_sync(&timer_spu_event_swap);
1200 }
1201
1202 static void cell_global_stop_ppu(void)
1203 {
1204         int cpu;
1205
1206         /*
1207          * This routine will be called once for the system.
1208          * There is one performance monitor per node, so we
1209          * only need to perform this function once per node.
1210          */
1211         del_timer_sync(&timer_virt_cntr);
1212         oprofile_running = 0;
1213         smp_wmb();
1214
1215         for_each_online_cpu(cpu) {
1216                 if (cbe_get_hw_thread_id(cpu))
1217                         continue;
1218
1219                 cbe_sync_irq(cbe_cpu_to_node(cpu));
1220                 /* Stop the counters */
1221                 cbe_disable_pm(cpu);
1222
1223                 /* Deactivate the signals */
1224                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1225
1226                 /* Deactivate interrupts */
1227                 cbe_disable_pm_interrupts(cpu);
1228         }
1229 }
1230
1231 static void cell_global_stop(void)
1232 {
1233         if (profiling_mode == PPU_PROFILING)
1234                 cell_global_stop_ppu();
1235         else if (profiling_mode == SPU_PROFILING_EVENTS)
1236                 cell_global_stop_spu_events();
1237         else
1238                 cell_global_stop_spu_cycles();
1239 }
1240
1241 static int cell_global_start_spu_cycles(struct op_counter_config *ctr)
1242 {
1243         int subfunc;
1244         unsigned int lfsr_value;
1245         int cpu;
1246         int ret;
1247         int rtas_error;
1248         unsigned int cpu_khzfreq = 0;
1249
1250         /* The SPU profiling uses time-based profiling based on
1251          * cpu frequency, so if configured with the CPU_FREQ
1252          * option, we should detect frequency changes and react
1253          * accordingly.
1254          */
1255 #ifdef CONFIG_CPU_FREQ
1256         ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
1257                                         CPUFREQ_TRANSITION_NOTIFIER);
1258         if (ret < 0)
1259                 /* this is not a fatal error */
1260                 printk(KERN_ERR "CPU freq change registration failed: %d\n",
1261                        ret);
1262
1263         else
1264                 cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
1265 #endif
1266
1267         set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
1268
1269         for_each_online_cpu(cpu) {
1270                 if (cbe_get_hw_thread_id(cpu))
1271                         continue;
1272
1273                 /*
1274                  * Setup SPU cycle-based profiling.
1275                  * Set perf_mon_control bit 0 to a zero before
1276                  * enabling spu collection hardware.
1277                  */
1278                 cbe_write_pm(cpu, pm_control, 0);
1279
1280                 if (spu_cycle_reset > MAX_SPU_COUNT)
1281                         /* use largest possible value */
1282                         lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
1283                 else
1284                         lfsr_value = calculate_lfsr(spu_cycle_reset);
1285
1286                 /* must use a non zero value. Zero disables data collection. */
1287                 if (lfsr_value == 0)
1288                         lfsr_value = calculate_lfsr(1);
1289
1290                 lfsr_value = lfsr_value << 8; /* shift lfsr to correct
1291                                                 * register location
1292                                                 */
1293
1294                 /* debug bus setup */
1295                 ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
1296
1297                 if (unlikely(ret)) {
1298                         rtas_error = ret;
1299                         goto out;
1300                 }
1301
1302
1303                 subfunc = 2;    /* 2 - activate SPU tracing, 3 - deactivate */
1304
1305                 /* start profiling */
1306                 ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
1307                                 cbe_cpu_to_node(cpu), lfsr_value);
1308
1309                 if (unlikely(ret != 0)) {
1310                         printk(KERN_ERR
1311                                "%s: rtas call ibm,cbe-spu-perftools failed, " \
1312                                "return = %d\n", __func__, ret);
1313                         rtas_error = -EIO;
1314                         goto out;
1315                 }
1316         }
1317
1318         rtas_error = start_spu_profiling_cycles(spu_cycle_reset);
1319         if (rtas_error)
1320                 goto out_stop;
1321
1322         oprofile_running = 1;
1323         return 0;
1324
1325 out_stop:
1326         cell_global_stop_spu_cycles();  /* clean up the PMU/debug bus */
1327 out:
1328         return rtas_error;
1329 }
1330
1331 static int cell_global_start_spu_events(struct op_counter_config *ctr)
1332 {
1333         int cpu;
1334         u32 interrupt_mask = 0;
1335         int rtn = 0;
1336
1337         hdw_thread = 0;
1338
1339         /* spu event profiling, uses the performance counters to generate
1340          * an interrupt.  The hardware is setup to store the SPU program
1341          * counter into the trace array.  The occurrence mode is used to
1342          * enable storing data to the trace buffer.  The bits are set
1343          * to send/store the SPU address in the trace buffer.  The debug
1344          * bus must be setup to route the SPU program counter onto the
1345          * debug bus.  The occurrence data in the trace buffer is not used.
1346          */
1347
1348         /* This routine gets called once for the system.
1349          * There is one performance monitor per node, so we
1350          * only need to perform this function once per node.
1351          */
1352
1353         for_each_online_cpu(cpu) {
1354                 if (cbe_get_hw_thread_id(cpu))
1355                         continue;
1356
1357                 /*
1358                  * Setup SPU event-based profiling.
1359                  * Set perf_mon_control bit 0 to a zero before
1360                  * enabling spu collection hardware.
1361                  *
1362                  * Only support one SPU event on one SPU per node.
1363                  */
1364                 if (ctr_enabled & 1) {
1365                         cbe_write_ctr(cpu, 0, reset_value[0]);
1366                         enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
1367                         interrupt_mask |=
1368                                 CBE_PM_CTR_OVERFLOW_INTR(0);
1369                 } else {
1370                         /* Disable counter */
1371                         cbe_write_pm07_control(cpu, 0, 0);
1372                 }
1373
1374                 cbe_get_and_clear_pm_interrupts(cpu);
1375                 cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1376                 cbe_enable_pm(cpu);
1377
1378                 /* clear the trace buffer */
1379                 cbe_write_pm(cpu, trace_address, 0);
1380         }
1381
1382         /* Start the timer to time slice collecting the event profile
1383          * on each of the SPUs.  Note, can collect profile on one SPU
1384          * per node at a time.
1385          */
1386         start_spu_event_swap();
1387         start_spu_profiling_events();
1388         oprofile_running = 1;
1389         smp_wmb();
1390
1391         return rtn;
1392 }
1393
1394 static int cell_global_start_ppu(struct op_counter_config *ctr)
1395 {
1396         u32 cpu, i;
1397         u32 interrupt_mask = 0;
1398
1399         /* This routine gets called once for the system.
1400          * There is one performance monitor per node, so we
1401          * only need to perform this function once per node.
1402          */
1403         for_each_online_cpu(cpu) {
1404                 if (cbe_get_hw_thread_id(cpu))
1405                         continue;
1406
1407                 interrupt_mask = 0;
1408
1409                 for (i = 0; i < num_counters; ++i) {
1410                         if (ctr_enabled & (1 << i)) {
1411                                 cbe_write_ctr(cpu, i, reset_value[i]);
1412                                 enable_ctr(cpu, i, pm_regs.pm07_cntrl);
1413                                 interrupt_mask |= CBE_PM_CTR_OVERFLOW_INTR(i);
1414                         } else {
1415                                 /* Disable counter */
1416                                 cbe_write_pm07_control(cpu, i, 0);
1417                         }
1418                 }
1419
1420                 cbe_get_and_clear_pm_interrupts(cpu);
1421                 cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1422                 cbe_enable_pm(cpu);
1423         }
1424
1425         virt_cntr_inter_mask = interrupt_mask;
1426         oprofile_running = 1;
1427         smp_wmb();
1428
1429         /*
1430          * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
1431          * executed which manipulates the PMU.  We start the "virtual counter"
1432          * here so that we do not need to synchronize access to the PMU in
1433          * the above for-loop.
1434          */
1435         start_virt_cntrs();
1436
1437         return 0;
1438 }
1439
1440 static int cell_global_start(struct op_counter_config *ctr)
1441 {
1442         if (profiling_mode == SPU_PROFILING_CYCLES)
1443                 return cell_global_start_spu_cycles(ctr);
1444         else if (profiling_mode == SPU_PROFILING_EVENTS)
1445                 return cell_global_start_spu_events(ctr);
1446         else
1447                 return cell_global_start_ppu(ctr);
1448 }
1449
1450
/* The SPU interrupt handler
 *
 * SPU event profiling works as follows:
 * The pm_signal[0] holds the one SPU event to be measured.  It is routed on
 * the debug bus using word 0 or 1.  The value of pm_signal[1] and
 * pm_signal[2] contain the necessary events to route the SPU program
 * counter for the selected SPU onto the debug bus using words 2 and 3.
 * The pm_interval register is setup to write the SPU PC value into the
 * trace buffer at the maximum rate possible.  The trace buffer is configured
 * to store the PCs, wrapping when it is full.  The performance counter is
 * initialized to the max hardware count minus the number of events, N, between
 * samples.  Once the N events have occurred, a HW counter overflow occurs
 * causing the generation of a HW counter interrupt which also stops the
 * writing of the SPU PC values to the trace buffer.  Hence the last PC
 * written to the trace buffer is the SPU PC that we want.  Unfortunately,
 * we have to read from the beginning of the trace buffer to get to the
 * last value written.  We just hope the PPU has nothing better to do than
 * service this interrupt. The PC for the specific SPU being profiled is
 * extracted from the trace buffer, processed and stored.  The trace buffer
 * is cleared, interrupts are cleared, the counter is reset to max - N.
 * A kernel timer is used to periodically call the routine spu_evnt_swap()
 * to switch to the next physical SPU in the node to profile in round robin
 * order.  This way data is collected for all SPUs on the node. It does mean
 * that we need to use a relatively small value of N to ensure enough samples
 * on each SPU are collected, since each SPU is being profiled only 1/8 of
 * the time.  It may also be necessary to use a longer sample collection
 * period.
 */
1478 static void cell_handle_interrupt_spu(struct pt_regs *regs,
1479                                       struct op_counter_config *ctr)
1480 {
1481         u32 cpu, cpu_tmp;
1482         u64 trace_entry;
1483         u32 interrupt_mask;
1484         u64 trace_buffer[2];
1485         u64 last_trace_buffer;
1486         u32 sample;
1487         u32 trace_addr;
1488         unsigned long sample_array_lock_flags;
1489         int spu_num;
1490         unsigned long flags;
1491
1492         /* Make sure spu event interrupt handler and spu event swap
1493          * don't access the counters simultaneously.
1494          */
1495         cpu = smp_processor_id();
1496         spin_lock_irqsave(&cntr_lock, flags);
1497
1498         cpu_tmp = cpu;
1499         cbe_disable_pm(cpu);
1500
1501         interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1502
1503         sample = 0xABCDEF;
1504         trace_entry = 0xfedcba;
1505         last_trace_buffer = 0xdeadbeaf;
1506
1507         if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1508                 /* disable writes to trace buff */
1509                 cbe_write_pm(cpu, pm_interval, 0);
1510
1511                 /* only have one perf cntr being used, cntr 0 */
1512                 if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(0))
1513                     && ctr[0].enabled)
1514                         /* The SPU PC values will be read
1515                          * from the trace buffer, reset counter
1516                          */
1517
1518                         cbe_write_ctr(cpu, 0, reset_value[0]);
1519
1520                 trace_addr = cbe_read_pm(cpu, trace_address);
1521
1522                 while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
1523                         /* There is data in the trace buffer to process
1524                          * Read the buffer until you get to the last
1525                          * entry.  This is the value we want.
1526                          */
1527
1528                         cbe_read_trace_buffer(cpu, trace_buffer);
1529                         trace_addr = cbe_read_pm(cpu, trace_address);
1530                 }
1531
1532                 /* SPU Address 16 bit count format for 128 bit
1533                  * HW trace buffer is used for the SPU PC storage
1534                  *    HDR bits          0:15
1535                  *    SPU Addr 0 bits   16:31
1536                  *    SPU Addr 1 bits   32:47
1537                  *    unused bits       48:127
1538                  *
1539                  * HDR: bit4 = 1 SPU Address 0 valid
1540                  * HDR: bit5 = 1 SPU Address 1 valid
1541                  *  - unfortunately, the valid bits don't seem to work
1542                  *
1543                  * Note trace_buffer[0] holds bits 0:63 of the HW
1544                  * trace buffer, trace_buffer[1] holds bits 64:127
1545                  */
1546
1547                 trace_entry = trace_buffer[0]
1548                         & 0x00000000FFFF0000;
1549
1550                 /* only top 16 of the 18 bit SPU PC address
1551                  * is stored in trace buffer, hence shift right
1552                  * by 16 -2 bits */
1553                 sample = trace_entry >> 14;
1554                 last_trace_buffer = trace_buffer[0];
1555
1556                 spu_num = spu_evnt_phys_spu_indx
1557                         + (cbe_cpu_to_node(cpu) * NUM_SPUS_PER_NODE);
1558
1559                 /* make sure only one process at a time is calling
1560                  * spu_sync_buffer()
1561                  */
1562                 spin_lock_irqsave(&oprof_spu_smpl_arry_lck,
1563                                   sample_array_lock_flags);
1564                 spu_sync_buffer(spu_num, &sample, 1);
1565                 spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck,
1566                                        sample_array_lock_flags);
1567
1568                 smp_wmb();    /* insure spu event buffer updates are written
1569                                * don't want events intermingled... */
1570
1571                 /* The counters were frozen by the interrupt.
1572                  * Reenable the interrupt and restart the counters.
1573                  */
1574                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1575                 cbe_enable_pm_interrupts(cpu, hdw_thread,
1576                                          virt_cntr_inter_mask);
1577
1578                 /* clear the trace buffer, re-enable writes to trace buff */
1579                 cbe_write_pm(cpu, trace_address, 0);
1580                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1581
1582                 /* The writes to the various performance counters only writes
1583                  * to a latch.  The new values (interrupt setting bits, reset
1584                  * counter value etc.) are not copied to the actual registers
1585                  * until the performance monitor is enabled.  In order to get
1586                  * this to work as desired, the performance monitor needs to
1587                  * be disabled while writing to the latches.  This is a
1588                  * HW design issue.
1589                  */
1590                 write_pm_cntrl(cpu);
1591                 cbe_enable_pm(cpu);
1592         }
1593         spin_unlock_irqrestore(&cntr_lock, flags);
1594 }
1595
1596 static void cell_handle_interrupt_ppu(struct pt_regs *regs,
1597                                       struct op_counter_config *ctr)
1598 {
1599         u32 cpu;
1600         u64 pc;
1601         int is_kernel;
1602         unsigned long flags = 0;
1603         u32 interrupt_mask;
1604         int i;
1605
1606         cpu = smp_processor_id();
1607
1608         /*
1609          * Need to make sure the interrupt handler and the virt counter
1610          * routine are not running at the same time. See the
1611          * cell_virtual_cntr() routine for additional comments.
1612          */
1613         spin_lock_irqsave(&cntr_lock, flags);
1614
1615         /*
1616          * Need to disable and reenable the performance counters
1617          * to get the desired behavior from the hardware.  This
1618          * is hardware specific.
1619          */
1620
1621         cbe_disable_pm(cpu);
1622
1623         interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1624
1625         /*
1626          * If the interrupt mask has been cleared, then the virt cntr
1627          * has cleared the interrupt.  When the thread that generated
1628          * the interrupt is restored, the data count will be restored to
1629          * 0xffffff0 to cause the interrupt to be regenerated.
1630          */
1631
1632         if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1633                 pc = regs->nip;
1634                 is_kernel = is_kernel_addr(pc);
1635
1636                 for (i = 0; i < num_counters; ++i) {
1637                         if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(i))
1638                             && ctr[i].enabled) {
1639                                 oprofile_add_ext_sample(pc, regs, i, is_kernel);
1640                                 cbe_write_ctr(cpu, i, reset_value[i]);
1641                         }
1642                 }
1643
1644                 /*
1645                  * The counters were frozen by the interrupt.
1646                  * Reenable the interrupt and restart the counters.
1647                  * If there was a race between the interrupt handler and
1648                  * the virtual counter routine.  The virtual counter
1649                  * routine may have cleared the interrupts.  Hence must
1650                  * use the virt_cntr_inter_mask to re-enable the interrupts.
1651                  */
1652                 cbe_enable_pm_interrupts(cpu, hdw_thread,
1653                                          virt_cntr_inter_mask);
1654
1655                 /*
1656                  * The writes to the various performance counters only writes
1657                  * to a latch.  The new values (interrupt setting bits, reset
1658                  * counter value etc.) are not copied to the actual registers
1659                  * until the performance monitor is enabled.  In order to get
1660                  * this to work as desired, the performance monitor needs to
1661                  * be disabled while writing to the latches.  This is a
1662                  * HW design issue.
1663                  */
1664                 cbe_enable_pm(cpu);
1665         }
1666         spin_unlock_irqrestore(&cntr_lock, flags);
1667 }
1668
1669 static void cell_handle_interrupt(struct pt_regs *regs,
1670                                   struct op_counter_config *ctr)
1671 {
1672         if (profiling_mode == PPU_PROFILING)
1673                 cell_handle_interrupt_ppu(regs, ctr);
1674         else
1675                 cell_handle_interrupt_spu(regs, ctr);
1676 }
1677
1678 /*
1679  * This function is called from the generic OProfile
1680  * driver.  When profiling PPUs, we need to do the
1681  * generic sync start; otherwise, do spu_sync_start.
1682  */
1683 static int cell_sync_start(void)
1684 {
1685         if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1686             (profiling_mode == SPU_PROFILING_EVENTS))
1687                 return spu_sync_start();
1688         else
1689                 return DO_GENERIC_SYNC;
1690 }
1691
1692 static int cell_sync_stop(void)
1693 {
1694         if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1695             (profiling_mode == SPU_PROFILING_EVENTS))
1696                 return spu_sync_stop();
1697         else
1698                 return 1;
1699 }
1700
1701 struct op_powerpc_model op_model_cell = {
1702         .reg_setup = cell_reg_setup,
1703         .cpu_setup = cell_cpu_setup,
1704         .global_start = cell_global_start,
1705         .global_stop = cell_global_stop,
1706         .sync_start = cell_sync_start,
1707         .sync_stop = cell_sync_stop,
1708         .handle_interrupt = cell_handle_interrupt,
1709 };