ring-buffer: Revert "ring-buffer: Get timestamp after event is allocated"
authorSteven Rostedt (Red Hat) <rostedt@goodmis.org>
Thu, 3 Sep 2015 12:57:12 +0000 (08:57 -0400)
committerSteven Rostedt <rostedt@goodmis.org>
Thu, 3 Sep 2015 12:57:12 +0000 (08:57 -0400)
The commit a4543a2fa9ef31 "ring-buffer: Get timestamp after event is
allocated" is needed for some future work. But after adding it, there is a
race somewhere that causes the saved timestamp to have a slight shift, and
get ahead of the actual timestamp and make it look like time goes backwards.

I'm still looking into why this happens, but in the mean time, this is
holding up other work to get in. I'm reverting the change for now (which
makes the problem go away), and will add it back after I know what is wrong
and fix it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
kernel/trace/ring_buffer.c

index 1cce0fbf92cea04e394a6a8a0990cbf729831b1a..fc347f8b1bca24debb823cdeca09424368c16e2e 100644 (file)
@@ -2141,8 +2141,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
        local_sub(length, &tail_page->write);
 }
 
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
-
 /*
  * This is the slow path, force gcc not to inline it.
  */
@@ -2157,16 +2155,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
        int ret;
        u64 ts;
 
-       /*
-        * If the event had a timestamp attached to it, remove it.
-        * The first event on a page (nested or not) always uses
-        * the full timestamp of the new page.
-        */
-       if (info->add_timestamp) {
-               info->add_timestamp = 0;
-               info->length -= RB_LEN_TIME_EXTEND;
-       }
-
        next_page = tail_page;
 
        rb_inc_page(cpu_buffer, &next_page);
@@ -2253,11 +2241,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
        rb_reset_tail(cpu_buffer, tail, info);
 
-       /* Commit what we have for now to update timestamps */
-       rb_end_commit(cpu_buffer);
-       /* rb_end_commit() decs committing */
-       local_inc(&cpu_buffer->committing);
-
        /* fail and let the caller try again */
        return ERR_PTR(-EAGAIN);
 
@@ -2287,6 +2270,9 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
        return skip_time_extend(event);
 }
 
+static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                                    struct ring_buffer_event *event);
+
 /**
  * rb_update_event - update event type and data
  * @event: the event to update
@@ -2298,7 +2284,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
  * and with this, we can determine what to place into the
  * data field.
  */
-static void __always_inline
+static void
 rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
                struct ring_buffer_event *event,
                struct rb_event_info *info)
@@ -2306,6 +2292,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
        unsigned length = info->length;
        u64 delta = info->delta;
 
+       /* Only a commit updates the timestamp */
+       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+               delta = 0;
+
        /*
         * If we need to add a timestamp, then we
         * add it to the start of the resevered space.
@@ -2685,13 +2675,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static noinline void
 rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-                   struct ring_buffer_event *event,
                    struct rb_event_info *info)
 {
-       struct ring_buffer_event *padding;
-       int length;
-       int size;
-
        WARN_ONCE(info->delta > (1ULL << 59),
                  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
                  (unsigned long long)info->delta,
@@ -2701,61 +2686,7 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
                  "If you just came from a suspend/resume,\n"
                  "please switch to the trace global clock:\n"
                  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-
-       /*
-        * Discarding this event to add a timestamp in front, but
-        * we still need to update the length of it to perform the discard.
-        */
-       rb_update_event(cpu_buffer, event, info);
-
-       if (rb_try_to_discard(cpu_buffer, event)) {
-               info->add_timestamp = 1;
-               /*
-                * The time delta since the last event is too big to
-                * hold in the time field of the event, then we append a
-                * TIME EXTEND event ahead of the data event.
-                */
-               info->length += RB_LEN_TIME_EXTEND;
-               return;
-       }
-
-       /*
-        * Humpf! An event came in after this one, and because it is not a
-        * commit, it will have a delta of zero, thus, it will take on
-        * the timestamp of the previous commit, which happened a long time
-        * ago (we need to add a timestamp, remember?).
-        * We need to add the timestamp here. A timestamp is a fixed size
-        * of 8 bytes. That means the rest of the event needs to be
-        * padding.
-        */
-       size = info->length - RB_LEN_TIME_EXTEND;
-
-       /* The padding will have a delta of 1 */
-       if (size)
-               info->delta--;
-
-       padding = rb_add_time_stamp(event, info->delta);
-
-       if (size) {
-               length = info->length;
-               info->delta = 0;
-               info->length = size;
-               rb_update_event(cpu_buffer, padding, info);
-
-               rb_event_discard(padding);
-
-               /* Still visible, need to update write_stamp */
-               rb_update_write_stamp(cpu_buffer, event);
-
-               /* Still need to commit the padding. */
-               rb_end_commit(cpu_buffer);
-
-               /* rb_end_commit() decs committing */
-               local_inc(&cpu_buffer->committing);
-
-               /* The next iteration still uses the original length */
-               info->length = length;
-       }
+       info->add_timestamp = 1;
 }
 
 static struct ring_buffer_event *
@@ -2765,7 +2696,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        struct ring_buffer_event *event;
        struct buffer_page *tail_page;
        unsigned long tail, write;
-       bool is_commit;
+
+       /*
+        * If the time delta since the last event is too big to
+        * hold in the time field of the event, then we append a
+        * TIME EXTEND event ahead of the data event.
+        */
+       if (unlikely(info->add_timestamp))
+               info->length += RB_LEN_TIME_EXTEND;
 
        tail_page = info->tail_page = cpu_buffer->tail_page;
        write = local_add_return(info->length, &tail_page->write);
@@ -2774,43 +2712,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        write &= RB_WRITE_MASK;
        tail = write - info->length;
 
-       /* See if we shot pass the end of this buffer page */
-       if (unlikely(write > BUF_PAGE_SIZE))
-               return rb_move_tail(cpu_buffer, tail, info);
-
-       /* We reserved something on the buffer */
-       event = __rb_page_index(tail_page, tail);
-
        /*
         * If this is the first commit on the page, then it has the same
-        * timestamp as the page itself, otherwise we need to figure out
-        * the delta.
+        * timestamp as the page itself.
         */
-       info->ts = rb_time_stamp(cpu_buffer->buffer);
-       is_commit = rb_event_is_commit(cpu_buffer, event);
-
-       /* Commits are special (non nested events) */
-       info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0;
-
-       if (!tail) {
-               /*
-                * If this is the first commit on the page, set the
-                * page to its timestamp.
-                */
-               tail_page->page->time_stamp = info->ts;
+       if (!tail)
                info->delta = 0;
 
-       } else if (unlikely(test_time_stamp(info->delta)) &&
-                  !info->add_timestamp) {
-               rb_handle_timestamp(cpu_buffer, event, info);
-               return ERR_PTR(-EAGAIN);
-       }
+       /* See if we shot pass the end of this buffer page */
+       if (unlikely(write > BUF_PAGE_SIZE))
+               return rb_move_tail(cpu_buffer, tail, info);
 
+       /* We reserved something on the buffer */
+
+       event = __rb_page_index(tail_page, tail);
        kmemcheck_annotate_bitfield(event, bitfield);
        rb_update_event(cpu_buffer, event, info);
 
        local_inc(&tail_page->entries);
 
+       /*
+        * If this is the first commit on the page, then update
+        * its timestamp.
+        */
+       if (!tail)
+               tail_page->page->time_stamp = info->ts;
+
        /* account for these added bytes */
        local_add(info->length, &cpu_buffer->entries_bytes);
 
@@ -2825,6 +2752,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
        struct ring_buffer_event *event;
        struct rb_event_info info;
        int nr_loops = 0;
+       u64 diff;
 
        rb_start_commit(cpu_buffer);
 
@@ -2842,9 +2770,12 @@ rb_reserve_next_event(struct ring_buffer *buffer,
                return NULL;
        }
 #endif
+
        info.length = rb_calculate_event_length(length);
-       info.add_timestamp = 0;
  again:
+       info.add_timestamp = 0;
+       info.delta = 0;
+
        /*
         * We allow for interrupts to reenter here and do a trace.
         * If one does, it will cause this original code to loop
@@ -2857,6 +2788,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                goto out_fail;
 
+       info.ts = rb_time_stamp(cpu_buffer->buffer);
+       diff = info.ts - cpu_buffer->write_stamp;
+
+       /* make sure this diff is calculated here */
+       barrier();
+
+       /* Did the write stamp get updated already? */
+       if (likely(info.ts >= cpu_buffer->write_stamp)) {
+               info.delta = diff;
+               if (unlikely(test_time_stamp(info.delta)))
+                       rb_handle_timestamp(cpu_buffer, &info);
+       }
+
        event = __rb_reserve_next(cpu_buffer, &info);
 
        if (unlikely(PTR_ERR(event) == -EAGAIN))