Documentation/RCU/listRCU.txt

   1 Using RCU to Protect Read-Mostly Linked Lists
   2
   3
   4 One of the best applications of RCU is to protect read-mostly linked lists
   5 ("struct list_head" in list.h).  One big advantage of this approach
   6 is that all of the required memory barriers are included for you in
   7 the list macros.  This document describes several applications of RCU,
   8 with the best fits first.
   9
  10
  11 Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
  12
  13 The best applications are cases where, if reader-writer locking were
  14 used, the read-side lock would be dropped before taking any action
  15 based on the results of the search.  The most celebrated example is
  16 the routing table.  Because the routing table is tracking the state of
  17 equipment outside of the computer, it will at times contain stale data.
  18 Therefore, once the route has been computed, there is no need to hold
  19 the routing table static during transmission of the packet.  After all,
  20 you can hold the routing table static all you want, but that won't keep
  21 the external Internet from changing, and it is the state of the external
  22 Internet that really matters.  In addition, routing entries are typically
  23 added or deleted, rather than being modified in place.
  24
  25 A straightforward example of this use of RCU may be found in the
  26 system-call auditing support.  For example, a reader-writer locked
  27 implementation of audit_filter_task() might be as follows:
  28
  29         static enum audit_state audit_filter_task(struct task_struct *tsk)
  30         {
  31                 struct audit_entry *e;
  32                 enum audit_state   state;
  33
  34                 read_lock(&auditsc_lock);
  35                 list_for_each_entry(e, &audit_tsklist, list) {
  36                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
  37                                 read_unlock(&auditsc_lock);
  38                                 return state;
  39                         }
  40                 }
  41                 read_unlock(&auditsc_lock);
  42                 return AUDIT_BUILD_CONTEXT;
  43         }
  44
  45 Here the list is searched under the lock, but the lock is dropped before
  46 the corresponding value is returned.  By the time that this value is acted
  47 on, the list may well have been modified.  This makes sense, since if
  48 you are turning auditing off, it is OK to audit a few extra system calls.
  49
  50 This means that RCU can be easily applied to the read side, as follows:
  51
  52         static enum audit_state audit_filter_task(struct task_struct *tsk)
  53         {
  54                 struct audit_entry *e;
  55                 enum audit_state   state;
  56
  57                 rcu_read_lock();
  58                 list_for_each_entry_rcu(e, &audit_tsklist, list) {
  59                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
  60                                 rcu_read_unlock();
  61                                 return state;
  62                         }
  63                 }
  64                 rcu_read_unlock();
  65                 return AUDIT_BUILD_CONTEXT;
  66         }
  67
  68 The read_lock() and read_unlock() calls have become rcu_read_lock()
  69 and rcu_read_unlock(), respectively, and the list_for_each_entry() has
  70 become list_for_each_entry_rcu().  The _rcu() list-traversal primitives
  71 insert the read-side memory barriers that are required on DEC Alpha CPUs.
  72
  73 The changes to the update side are also straightforward.  A reader-writer
  74 lock might be used as follows for deletion and insertion:
  75
  76         static inline int audit_del_rule(struct audit_rule *rule,
  77                                          struct list_head *list)
  78         {
  79                 struct audit_entry  *e;
  80
  81                 write_lock(&auditsc_lock);
  82                 list_for_each_entry(e, list, list) {
  83                         if (!audit_compare_rule(rule, &e->rule)) {
  84                                 list_del(&e->list);
  85                                 write_unlock(&auditsc_lock);
  86                                 return 0;
  87                         }
  88                 }
  89                 write_unlock(&auditsc_lock);
  90                 return -EFAULT;         /* No matching rule */
  91         }
  92
  93         static inline int audit_add_rule(struct audit_entry *entry,
  94                                          struct list_head *list)
  95         {
  96                 write_lock(&auditsc_lock);
  97                 if (entry->rule.flags & AUDIT_PREPEND) {
  98                         entry->rule.flags &= ~AUDIT_PREPEND;
  99                         list_add(&entry->list, list);
 100                 } else {
 101                         list_add_tail(&entry->list, list);
 102                 }
 103                 write_unlock(&auditsc_lock);
 104                 return 0;
 105         }
 106
 107 Following are the RCU equivalents for these two functions:
 108
 109         static inline int audit_del_rule(struct audit_rule *rule,
 110                                          struct list_head *list)
 111         {
 112                 struct audit_entry  *e;
 113
 114                 /* Do not use the _rcu iterator here, since this is the only
 115                  * deletion routine. */
 116                 list_for_each_entry(e, list, list) {
 117                         if (!audit_compare_rule(rule, &e->rule)) {
 118                                 list_del_rcu(&e->list);
 119                                 call_rcu(&e->rcu, audit_free_rule, e);
 120                                 return 0;
 121                         }
 122                 }
 123                 return -EFAULT;         /* No matching rule */
 124         }
 125
 126         static inline int audit_add_rule(struct audit_entry *entry,
 127                                          struct list_head *list)
 128         {
 129                 if (entry->rule.flags & AUDIT_PREPEND) {
 130                         entry->rule.flags &= ~AUDIT_PREPEND;
 131                         list_add_rcu(&entry->list, list);
 132                 } else {
 133                         list_add_tail_rcu(&entry->list, list);
 134                 }
 135                 return 0;
 136         }
 137
 138 Normally, the write_lock() and write_unlock() would be replaced by
 139 a spin_lock() and a spin_unlock(), but in this case, all callers hold
 140 audit_netlink_sem, so no additional locking is required.  The auditsc_lock
 141 can therefore be eliminated, since use of RCU eliminates the need for
 142 writers to exclude readers.
 143
 144 The list_del(), list_add(), and list_add_tail() primitives have been
 145 replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
 146 The _rcu() list-manipulation primitives add memory barriers that are
 147 needed on weakly ordered CPUs (most of them!).
 148
 149 So, when readers can tolerate stale data and when entries are either added
 150 or deleted, without in-place modification, it is very easy to use RCU!
 151
 152
 153 Example 2: Handling In-Place Updates
 154
 155 The system-call auditing code does not update auditing rules in place.
 156 However, if it did, reader-writer-locked code to do so might look as
 157 follows (presumably, the field_count is only permitted to decrease,
 158 otherwise, the added fields would need to be filled in):
 159
 160         static inline int audit_upd_rule(struct audit_rule *rule,
 161                                          struct list_head *list,
 162                                          __u32 newaction,
 163                                          __u32 newfield_count)
 164         {
 165                 struct audit_entry  *e;
 166                 struct audit_newentry *ne;
 167
 168                 write_lock(&auditsc_lock);
 169                 list_for_each_entry(e, list, list) {
 170                         if (!audit_compare_rule(rule, &e->rule)) {
 171                                 e->rule.action = newaction;
 172                                 e->rule.file_count = newfield_count;
 173                                 write_unlock(&auditsc_lock);
 174                                 return 0;
 175                         }
 176                 }
 177                 write_unlock(&auditsc_lock);
 178                 return -EFAULT;         /* No matching rule */
 179         }
 180
 181 The RCU version creates a copy, updates the copy, then replaces the old
 182 entry with the newly updated entry.  This sequence of actions, allowing
 183 concurrent reads while doing a copy to perform an update, is what gives
 184 RCU ("read-copy update") its name.  The RCU code is as follows:
 185
 186         static inline int audit_upd_rule(struct audit_rule *rule,
 187                                          struct list_head *list,
 188                                          __u32 newaction,
 189                                          __u32 newfield_count)
 190         {
 191                 struct audit_entry  *e;
 192                 struct audit_newentry *ne;
 193
 194                 list_for_each_entry(e, list, list) {
 195                         if (!audit_compare_rule(rule, &e->rule)) {
 196                                 ne = kmalloc(sizeof(*entry), GFP_ATOMIC);
 197                                 if (ne == NULL)
 198                                         return -ENOMEM;
 199                                 audit_copy_rule(&ne->rule, &e->rule);
 200                                 ne->rule.action = newaction;
 201                                 ne->rule.file_count = newfield_count;
 202                                 list_add_rcu(ne, e);
 203                                 list_del(e);
 204                                 call_rcu(&e->rcu, audit_free_rule, e);
 205                                 return 0;
 206                         }
 207                 }
 208                 return -EFAULT;         /* No matching rule */
 209         }
 210
 211 Again, this assumes that the caller holds audit_netlink_sem.  Normally,
 212 the reader-writer lock would become a spinlock in this sort of code.
 213
 214
 215 Example 3: Eliminating Stale Data
 216
 217 The auditing examples above tolerate stale data, as do most algorithms
 218 that are tracking external state.  Because there is a delay from the
 219 time the external state changes before Linux becomes aware of the change,
 220 additional RCU-induced staleness is normally not a problem.
 221
 222 However, there are many examples where stale data cannot be tolerated.
 223 One example in the Linux kernel is the System V IPC (see the ipc_lock()
 224 function in ipc/util.c).  This code checks a "deleted" flag under a
 225 per-entry spinlock, and, if the "deleted" flag is set, pretends that the
 226 entry does not exist.  For this to be helpful, the search function must
 227 return holding the per-entry spinlock, as ipc_lock() does in fact do.
 228
 229 Quick Quiz:  Why does the search function need to return holding the
 230 per-entry lock for this deleted-flag technique to be helpful?
 231
 232 If the system-call audit module were to ever need to reject stale data,
 233 one way to accomplish this would be to add a "deleted" flag and a "lock"
 234 spinlock to the audit_entry structure, and modify audit_filter_task()
 235 as follows:
 236
 237         static enum audit_state audit_filter_task(struct task_struct *tsk)
 238         {
 239                 struct audit_entry *e;
 240                 enum audit_state   state;
 241
 242                 rcu_read_lock();
 243                 list_for_each_entry_rcu(e, &audit_tsklist, list) {
 244                         if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
 245                                 spin_lock(&e->lock);
 246                                 if (e->deleted) {
 247                                         spin_unlock(&e->lock);
 248                                         rcu_read_unlock();
 249                                         return AUDIT_BUILD_CONTEXT;
 250                                 }
 251                                 rcu_read_unlock();
 252                                 return state;
 253                         }
 254                 }
 255                 rcu_read_unlock();
 256                 return AUDIT_BUILD_CONTEXT;
 257         }
 258
 259 Note that this example assumes that entries are only added and deleted.
 260 Additional mechanism is required to deal correctly with the
 261 update-in-place performed by audit_upd_rule().  For one thing,
 262 audit_upd_rule() would need additional memory barriers to ensure
 263 that the list_add_rcu() was really executed before the list_del_rcu().
 264
 265 The audit_del_rule() function would need to set the "deleted"
 266 flag under the spinlock as follows:
 267
 268         static inline int audit_del_rule(struct audit_rule *rule,
 269                                          struct list_head *list)
 270         {
 271                 struct audit_entry  *e;
 272
 273                 /* Do not use the _rcu iterator here, since this is the only
 274                  * deletion routine. */
 275                 list_for_each_entry(e, list, list) {
 276                         if (!audit_compare_rule(rule, &e->rule)) {
 277                                 spin_lock(&e->lock);
 278                                 list_del_rcu(&e->list);
 279                                 e->deleted = 1;
 280                                 spin_unlock(&e->lock);
 281                                 call_rcu(&e->rcu, audit_free_rule, e);
 282                                 return 0;
 283                         }
 284                 }
 285                 return -EFAULT;         /* No matching rule */
 286         }
 287
 288
 289 Summary
 290
 291 Read-mostly list-based data structures that can tolerate stale data are
 292 the most amenable to use of RCU.  The simplest case is where entries are
 293 either added or deleted from the data structure (or atomically modified
 294 in place), but non-atomic in-place modifications can be handled by making
 295 a copy, updating the copy, then replacing the original with the copy.
 296 If stale data cannot be tolerated, then a "deleted" flag may be used
 297 in conjunction with a per-entry spinlock in order to allow the search
 298 function to reject newly deleted data.
 299
 300
 301 Answer to Quick Quiz
 302
 303 If the search function drops the per-entry lock before returning, then
 304 the caller will be processing stale data in any case.  If it is really
 305 OK to be processing stale data, then you don't need a "deleted" flag.
 306 If processing stale data really is a problem, then you need to hold the
 307 per-entry lock across all of the code that uses the value looked up.