Documentation/RCU/listRCU.rst

   1 .. _list_rcu_doc:
   2
   3 Using RCU to Protect Read-Mostly Linked Lists
   4 =============================================
   5
   6 One of the best applications of RCU is to protect read-mostly linked lists
   7 ("struct list_head" in list.h).  One big advantage of this approach
   8 is that all of the required memory barriers are included for you in
   9 the list macros.  This document describes several applications of RCU,
  10 with the best fits first.
  11
  12 Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
  13 ----------------------------------------------------------------------
  14
  15 The best applications are cases where, if reader-writer locking were
  16 used, the read-side lock would be dropped before taking any action
  17 based on the results of the search.  The most celebrated example is
  18 the routing table.  Because the routing table is tracking the state of
  19 equipment outside of the computer, it will at times contain stale data.
  20 Therefore, once the route has been computed, there is no need to hold
  21 the routing table static during transmission of the packet.  After all,
  22 you can hold the routing table static all you want, but that won't keep
  23 the external Internet from changing, and it is the state of the external
  24 Internet that really matters.  In addition, routing entries are typically
  25 added or deleted, rather than being modified in place.
  26
  27 A straightforward example of this use of RCU may be found in the
  28 system-call auditing support.  For example, a reader-writer locked
  29 implementation of audit_filter_task() might be as follows::
  30
  31         static enum audit_state audit_filter_task(struct task_struct *tsk)
  32         {
  33                 struct audit_entry *e;
  34                 enum audit_state   state;
  35
  36                 read_lock(&auditsc_lock);
  37                 /* Note: audit_netlink_sem held by caller. */
  38                 list_for_each_entry(e, &audit_tsklist, list) {
  39                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
  40                                 read_unlock(&auditsc_lock);
  41                                 return state;
  42                         }
  43                 }
  44                 read_unlock(&auditsc_lock);
  45                 return AUDIT_BUILD_CONTEXT;
  46         }
  47
  48 Here the list is searched under the lock, but the lock is dropped before
  49 the corresponding value is returned.  By the time that this value is acted
  50 on, the list may well have been modified.  This makes sense, since if
  51 you are turning auditing off, it is OK to audit a few extra system calls.
  52
  53 This means that RCU can be easily applied to the read side, as follows::
  54
  55         static enum audit_state audit_filter_task(struct task_struct *tsk)
  56         {
  57                 struct audit_entry *e;
  58                 enum audit_state   state;
  59
  60                 rcu_read_lock();
  61                 /* Note: audit_netlink_sem held by caller. */
  62                 list_for_each_entry_rcu(e, &audit_tsklist, list) {
  63                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
  64                                 rcu_read_unlock();
  65                                 return state;
  66                         }
  67                 }
  68                 rcu_read_unlock();
  69                 return AUDIT_BUILD_CONTEXT;
  70         }
  71
  72 The read_lock() and read_unlock() calls have become rcu_read_lock()
  73 and rcu_read_unlock(), respectively, and the list_for_each_entry() has
  74 become list_for_each_entry_rcu().  The _rcu() list-traversal primitives
  75 insert the read-side memory barriers that are required on DEC Alpha CPUs.
  76
  77 The changes to the update side are also straightforward.  A reader-writer
  78 lock might be used as follows for deletion and insertion::
  79
  80         static inline int audit_del_rule(struct audit_rule *rule,
  81                                          struct list_head *list)
  82         {
  83                 struct audit_entry  *e;
  84
  85                 write_lock(&auditsc_lock);
  86                 list_for_each_entry(e, list, list) {
  87                         if (!audit_compare_rule(rule, &e->rule)) {
  88                                 list_del(&e->list);
  89                                 write_unlock(&auditsc_lock);
  90                                 return 0;
  91                         }
  92                 }
  93                 write_unlock(&auditsc_lock);
  94                 return -EFAULT;         /* No matching rule */
  95         }
  96
  97         static inline int audit_add_rule(struct audit_entry *entry,
  98                                          struct list_head *list)
  99         {
 100                 write_lock(&auditsc_lock);
 101                 if (entry->rule.flags & AUDIT_PREPEND) {
 102                         entry->rule.flags &= ~AUDIT_PREPEND;
 103                         list_add(&entry->list, list);
 104                 } else {
 105                         list_add_tail(&entry->list, list);
 106                 }
 107                 write_unlock(&auditsc_lock);
 108                 return 0;
 109         }
 110
 111 Following are the RCU equivalents for these two functions::
 112
 113         static inline int audit_del_rule(struct audit_rule *rule,
 114                                          struct list_head *list)
 115         {
 116                 struct audit_entry  *e;
 117
 118                 /* Do not use the _rcu iterator here, since this is the only
 119                  * deletion routine. */
 120                 list_for_each_entry(e, list, list) {
 121                         if (!audit_compare_rule(rule, &e->rule)) {
 122                                 list_del_rcu(&e->list);
 123                                 call_rcu(&e->rcu, audit_free_rule);
 124                                 return 0;
 125                         }
 126                 }
 127                 return -EFAULT;         /* No matching rule */
 128         }
 129
 130         static inline int audit_add_rule(struct audit_entry *entry,
 131                                          struct list_head *list)
 132         {
 133                 if (entry->rule.flags & AUDIT_PREPEND) {
 134                         entry->rule.flags &= ~AUDIT_PREPEND;
 135                         list_add_rcu(&entry->list, list);
 136                 } else {
 137                         list_add_tail_rcu(&entry->list, list);
 138                 }
 139                 return 0;
 140         }
 141
 142 Normally, the write_lock() and write_unlock() would be replaced by
 143 a spin_lock() and a spin_unlock(), but in this case, all callers hold
 144 audit_netlink_sem, so no additional locking is required.  The auditsc_lock
 145 can therefore be eliminated, since use of RCU eliminates the need for
 146 writers to exclude readers.  Normally, the write_lock() calls would
 147 be converted into spin_lock() calls.
 148
 149 The list_del(), list_add(), and list_add_tail() primitives have been
 150 replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
 151 The _rcu() list-manipulation primitives add memory barriers that are
 152 needed on weakly ordered CPUs (most of them!).  The list_del_rcu()
 153 primitive omits the pointer poisoning debug-assist code that would
 154 otherwise cause concurrent readers to fail spectacularly.
 155
 156 So, when readers can tolerate stale data and when entries are either added
 157 or deleted, without in-place modification, it is very easy to use RCU!
 158
 159 Example 2: Handling In-Place Updates
 160 ------------------------------------
 161
 162 The system-call auditing code does not update auditing rules in place.
 163 However, if it did, reader-writer-locked code to do so might look as
 164 follows (presumably, the field_count is only permitted to decrease,
 165 otherwise, the added fields would need to be filled in)::
 166
 167         static inline int audit_upd_rule(struct audit_rule *rule,
 168                                          struct list_head *list,
 169                                          __u32 newaction,
 170                                          __u32 newfield_count)
 171         {
 172                 struct audit_entry  *e;
 173                 struct audit_newentry *ne;
 174
 175                 write_lock(&auditsc_lock);
 176                 /* Note: audit_netlink_sem held by caller. */
 177                 list_for_each_entry(e, list, list) {
 178                         if (!audit_compare_rule(rule, &e->rule)) {
 179                                 e->rule.action = newaction;
 180                                 e->rule.file_count = newfield_count;
 181                                 write_unlock(&auditsc_lock);
 182                                 return 0;
 183                         }
 184                 }
 185                 write_unlock(&auditsc_lock);
 186                 return -EFAULT;         /* No matching rule */
 187         }
 188
 189 The RCU version creates a copy, updates the copy, then replaces the old
 190 entry with the newly updated entry.  This sequence of actions, allowing
 191 concurrent reads while doing a copy to perform an update, is what gives
 192 RCU ("read-copy update") its name.  The RCU code is as follows::
 193
 194         static inline int audit_upd_rule(struct audit_rule *rule,
 195                                          struct list_head *list,
 196                                          __u32 newaction,
 197                                          __u32 newfield_count)
 198         {
 199                 struct audit_entry  *e;
 200                 struct audit_newentry *ne;
 201
 202                 list_for_each_entry(e, list, list) {
 203                         if (!audit_compare_rule(rule, &e->rule)) {
 204                                 ne = kmalloc(sizeof(*entry), GFP_ATOMIC);
 205                                 if (ne == NULL)
 206                                         return -ENOMEM;
 207                                 audit_copy_rule(&ne->rule, &e->rule);
 208                                 ne->rule.action = newaction;
 209                                 ne->rule.file_count = newfield_count;
 210                                 list_replace_rcu(&e->list, &ne->list);
 211                                 call_rcu(&e->rcu, audit_free_rule);
 212                                 return 0;
 213                         }
 214                 }
 215                 return -EFAULT;         /* No matching rule */
 216         }
 217
 218 Again, this assumes that the caller holds audit_netlink_sem.  Normally,
 219 the reader-writer lock would become a spinlock in this sort of code.
 220
 221 Example 3: Eliminating Stale Data
 222 ---------------------------------
 223
 224 The auditing examples above tolerate stale data, as do most algorithms
 225 that are tracking external state.  Because there is a delay from the
 226 time the external state changes before Linux becomes aware of the change,
 227 additional RCU-induced staleness is normally not a problem.
 228
 229 However, there are many examples where stale data cannot be tolerated.
 230 One example in the Linux kernel is the System V IPC (see the ipc_lock()
 231 function in ipc/util.c).  This code checks a "deleted" flag under a
 232 per-entry spinlock, and, if the "deleted" flag is set, pretends that the
 233 entry does not exist.  For this to be helpful, the search function must
 234 return holding the per-entry spinlock, as ipc_lock() does in fact do.
 235
 236 Quick Quiz:
 237         Why does the search function need to return holding the per-entry lock for
 238         this deleted-flag technique to be helpful?
 239
 240 :ref:`Answer to Quick Quiz <answer_quick_quiz_list>`
 241
 242 If the system-call audit module were to ever need to reject stale data,
 243 one way to accomplish this would be to add a "deleted" flag and a "lock"
 244 spinlock to the audit_entry structure, and modify audit_filter_task()
 245 as follows::
 246
 247         static enum audit_state audit_filter_task(struct task_struct *tsk)
 248         {
 249                 struct audit_entry *e;
 250                 enum audit_state   state;
 251
 252                 rcu_read_lock();
 253                 list_for_each_entry_rcu(e, &audit_tsklist, list) {
 254                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
 255                                 spin_lock(&e->lock);
 256                                 if (e->deleted) {
 257                                         spin_unlock(&e->lock);
 258                                         rcu_read_unlock();
 259                                         return AUDIT_BUILD_CONTEXT;
 260                                 }
 261                                 rcu_read_unlock();
 262                                 return state;
 263                         }
 264                 }
 265                 rcu_read_unlock();
 266                 return AUDIT_BUILD_CONTEXT;
 267         }
 268
 269 Note that this example assumes that entries are only added and deleted.
 270 Additional mechanism is required to deal correctly with the
 271 update-in-place performed by audit_upd_rule().  For one thing,
 272 audit_upd_rule() would need additional memory barriers to ensure
 273 that the list_add_rcu() was really executed before the list_del_rcu().
 274
 275 The audit_del_rule() function would need to set the "deleted"
 276 flag under the spinlock as follows::
 277
 278         static inline int audit_del_rule(struct audit_rule *rule,
 279                                          struct list_head *list)
 280         {
 281                 struct audit_entry  *e;
 282
 283                 /* Do not need to use the _rcu iterator here, since this
 284                  * is the only deletion routine. */
 285                 list_for_each_entry(e, list, list) {
 286                         if (!audit_compare_rule(rule, &e->rule)) {
 287                                 spin_lock(&e->lock);
 288                                 list_del_rcu(&e->list);
 289                                 e->deleted = 1;
 290                                 spin_unlock(&e->lock);
 291                                 call_rcu(&e->rcu, audit_free_rule);
 292                                 return 0;
 293                         }
 294                 }
 295                 return -EFAULT;         /* No matching rule */
 296         }
 297
 298 Summary
 299 -------
 300
 301 Read-mostly list-based data structures that can tolerate stale data are
 302 the most amenable to use of RCU.  The simplest case is where entries are
 303 either added or deleted from the data structure (or atomically modified
 304 in place), but non-atomic in-place modifications can be handled by making
 305 a copy, updating the copy, then replacing the original with the copy.
 306 If stale data cannot be tolerated, then a "deleted" flag may be used
 307 in conjunction with a per-entry spinlock in order to allow the search
 308 function to reject newly deleted data.
 309
 310 .. _answer_quick_quiz_list:
 311
 312 Answer to Quick Quiz:
 313         Why does the search function need to return holding the per-entry
 314         lock for this deleted-flag technique to be helpful?
 315
 316         If the search function drops the per-entry lock before returning,
 317         then the caller will be processing stale data in any case.  If it
 318         is really OK to be processing stale data, then you don't need a
 319         "deleted" flag.  If processing stale data really is a problem,
 320         then you need to hold the per-entry lock across all of the code
 321         that uses the value that was returned.