page_pool: fix inconsistency for page_pool_ring_[un]lock()
net/core/page_pool.c
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * page_pool.c
4  *      Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
5  *      Copyright (C) 2016 Red Hat, Inc.
6  */
7
8 #include <linux/types.h>
9 #include <linux/kernel.h>
10 #include <linux/slab.h>
11 #include <linux/device.h>
12
13 #include <net/page_pool.h>
14 #include <net/xdp.h>
15
16 #include <linux/dma-direction.h>
17 #include <linux/dma-mapping.h>
18 #include <linux/page-flags.h>
19 #include <linux/mm.h> /* for put_page() */
20 #include <linux/poison.h>
21 #include <linux/ethtool.h>
22 #include <linux/netdevice.h>
23
24 #include <trace/events/page_pool.h>
25
26 #define DEFER_TIME (msecs_to_jiffies(1000))
27 #define DEFER_WARN_INTERVAL (60 * HZ)
28
29 #define BIAS_MAX        LONG_MAX
30
31 #ifdef CONFIG_PAGE_POOL_STATS
32 /* alloc_stat_inc is intended to be used in softirq context */
33 #define alloc_stat_inc(pool, __stat)    (pool->alloc_stats.__stat++)
34 /* recycle_stat_inc is safe to use when preemption is possible. */
35 #define recycle_stat_inc(pool, __stat)                                                  \
36         do {                                                                            \
37                 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;       \
38                 this_cpu_inc(s->__stat);                                                \
39         } while (0)
40
41 #define recycle_stat_add(pool, __stat, val)                                             \
42         do {                                                                            \
43                 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;       \
44                 this_cpu_add(s->__stat, val);                                           \
45         } while (0)
46
47 static const char pp_stats[][ETH_GSTRING_LEN] = {
48         "rx_pp_alloc_fast",
49         "rx_pp_alloc_slow",
50         "rx_pp_alloc_slow_ho",
51         "rx_pp_alloc_empty",
52         "rx_pp_alloc_refill",
53         "rx_pp_alloc_waive",
54         "rx_pp_recycle_cached",
55         "rx_pp_recycle_cache_full",
56         "rx_pp_recycle_ring",
57         "rx_pp_recycle_ring_full",
58         "rx_pp_recycle_released_ref",
59 };
60
61 bool page_pool_get_stats(struct page_pool *pool,
62                          struct page_pool_stats *stats)
63 {
64         int cpu = 0;
65
66         if (!stats)
67                 return false;
68
69         /* The caller is responsible for initializing stats. */
70         stats->alloc_stats.fast += pool->alloc_stats.fast;
71         stats->alloc_stats.slow += pool->alloc_stats.slow;
72         stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
73         stats->alloc_stats.empty += pool->alloc_stats.empty;
74         stats->alloc_stats.refill += pool->alloc_stats.refill;
75         stats->alloc_stats.waive += pool->alloc_stats.waive;
76
77         for_each_possible_cpu(cpu) {
78                 const struct page_pool_recycle_stats *pcpu =
79                         per_cpu_ptr(pool->recycle_stats, cpu);
80
81                 stats->recycle_stats.cached += pcpu->cached;
82                 stats->recycle_stats.cache_full += pcpu->cache_full;
83                 stats->recycle_stats.ring += pcpu->ring;
84                 stats->recycle_stats.ring_full += pcpu->ring_full;
85                 stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
86         }
87
88         return true;
89 }
90 EXPORT_SYMBOL(page_pool_get_stats);
91
92 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
93 {
94         int i;
95
96         for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
97                 memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
98                 data += ETH_GSTRING_LEN;
99         }
100
101         return data;
102 }
103 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
104
105 int page_pool_ethtool_stats_get_count(void)
106 {
107         return ARRAY_SIZE(pp_stats);
108 }
109 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
110
111 u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
112 {
113         struct page_pool_stats *pool_stats = stats;
114
115         *data++ = pool_stats->alloc_stats.fast;
116         *data++ = pool_stats->alloc_stats.slow;
117         *data++ = pool_stats->alloc_stats.slow_high_order;
118         *data++ = pool_stats->alloc_stats.empty;
119         *data++ = pool_stats->alloc_stats.refill;
120         *data++ = pool_stats->alloc_stats.waive;
121         *data++ = pool_stats->recycle_stats.cached;
122         *data++ = pool_stats->recycle_stats.cache_full;
123         *data++ = pool_stats->recycle_stats.ring;
124         *data++ = pool_stats->recycle_stats.ring_full;
125         *data++ = pool_stats->recycle_stats.released_refcnt;
126
127         return data;
128 }
129 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
130
131 #else
132 #define alloc_stat_inc(pool, __stat)
133 #define recycle_stat_inc(pool, __stat)
134 #define recycle_stat_add(pool, __stat, val)
135 #endif
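/* Illustrative sketch, not part of this file: a driver could export these
 * page_pool stats through its ethtool callbacks roughly as below.  The my_*
 * names and the priv->page_pool member are hypothetical, and the usual
 * driver-side includes (linux/ethtool.h, net/page_pool.h) are assumed.
 */
#ifdef CONFIG_PAGE_POOL_STATS
static void my_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
        if (stringset != ETH_SS_STATS)
                return;

        /* Returns the advanced position; driver-private strings could follow */
        data = page_pool_ethtool_stats_get_strings(data);
}

static int my_get_sset_count(struct net_device *dev, int sset)
{
        return sset == ETH_SS_STATS ? page_pool_ethtool_stats_get_count() : 0;
}

static void my_get_ethtool_stats(struct net_device *dev,
                                 struct ethtool_stats *stats, u64 *data)
{
        struct my_netdev_priv *priv = netdev_priv(dev); /* hypothetical priv */
        struct page_pool_stats pp_stats = { };

        /* Caller must zero-initialize stats, as noted in page_pool_get_stats() */
        if (page_pool_get_stats(priv->page_pool, &pp_stats))
                data = page_pool_ethtool_stats_get(data, &pp_stats);
}
#endif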
136
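/* The ptr_ring producer side must run with BH disabled unless we are already
 * in softirq context (e.g. NAPI).  These helpers pick the matching spin_lock
 * variant and report which one was taken, so the caller can release the lock
 * the same way (see page_pool_put_page_bulk() below).
 */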
137 static bool page_pool_producer_lock(struct page_pool *pool)
138         __acquires(&pool->ring.producer_lock)
139 {
140         bool in_softirq = in_softirq();
141
142         if (in_softirq)
143                 spin_lock(&pool->ring.producer_lock);
144         else
145                 spin_lock_bh(&pool->ring.producer_lock);
146
147         return in_softirq;
148 }
149
150 static void page_pool_producer_unlock(struct page_pool *pool,
151                                       bool in_softirq)
152         __releases(&pool->ring.producer_lock)
153 {
154         if (in_softirq)
155                 spin_unlock(&pool->ring.producer_lock);
156         else
157                 spin_unlock_bh(&pool->ring.producer_lock);
158 }
159
160 static int page_pool_init(struct page_pool *pool,
161                           const struct page_pool_params *params)
162 {
163         unsigned int ring_qsize = 1024; /* Default */
164
165         memcpy(&pool->p, params, sizeof(pool->p));
166
167         /* Validate only known flags were used */
168         if (pool->p.flags & ~(PP_FLAG_ALL))
169                 return -EINVAL;
170
171         if (pool->p.pool_size)
172                 ring_qsize = pool->p.pool_size;
173
174         /* Sanity limit mem that can be pinned down */
175         if (ring_qsize > 32768)
176                 return -E2BIG;
177
178         /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
179          * DMA_BIDIRECTIONAL allows the page to also be used for DMA sending,
180          * which is the XDP_TX use-case.
181          */
182         if (pool->p.flags & PP_FLAG_DMA_MAP) {
183                 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
184                     (pool->p.dma_dir != DMA_BIDIRECTIONAL))
185                         return -EINVAL;
186         }
187
188         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
189                 /* In order to request DMA-sync-for-device the page
190                  * needs to be mapped
191                  */
192                 if (!(pool->p.flags & PP_FLAG_DMA_MAP))
193                         return -EINVAL;
194
195                 if (!pool->p.max_len)
196                         return -EINVAL;
197
198                 /* pool->p.offset has to be set according to the address
199                  * offset used by the DMA engine to start copying rx data
200                  */
201         }
202
203         if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
204             pool->p.flags & PP_FLAG_PAGE_FRAG)
205                 return -EINVAL;
206
207 #ifdef CONFIG_PAGE_POOL_STATS
208         pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
209         if (!pool->recycle_stats)
210                 return -ENOMEM;
211 #endif
212
213         if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
214                 return -ENOMEM;
215
216         atomic_set(&pool->pages_state_release_cnt, 0);
217
218         /* A driver calling page_pool_create() must also call page_pool_destroy() */
219         refcount_set(&pool->user_cnt, 1);
220
221         if (pool->p.flags & PP_FLAG_DMA_MAP)
222                 get_device(pool->p.dev);
223
224         return 0;
225 }
226
227 struct page_pool *page_pool_create(const struct page_pool_params *params)
228 {
229         struct page_pool *pool;
230         int err;
231
232         pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
233         if (!pool)
234                 return ERR_PTR(-ENOMEM);
235
236         err = page_pool_init(pool, params);
237         if (err < 0) {
238                 pr_warn("%s() gave up with errno %d\n", __func__, err);
239                 kfree(pool);
240                 return ERR_PTR(err);
241         }
242
243         return pool;
244 }
245 EXPORT_SYMBOL(page_pool_create);
246
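/* Illustrative sketch, not part of this file: typical RX-side pool creation.
 * The my_* name, queue size and DMA settings are made-up example values, not
 * requirements.
 */
static struct page_pool *my_create_rx_pool(struct device *dev, int numa_node)
{
        struct page_pool_params pp_params = {
                .flags          = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                .order          = 0,                    /* order-0 pages */
                .pool_size      = 256,                  /* roughly the RX ring size */
                .nid            = numa_node,
                .dev            = dev,                  /* device doing the DMA */
                .dma_dir        = DMA_FROM_DEVICE,
                .max_len        = PAGE_SIZE,            /* sync whole page for device */
                .offset         = 0,                    /* RX data starts at offset 0 */
        };
        struct page_pool *pool;

        /* page_pool_create() returns an ERR_PTR() on failure, not NULL */
        pool = page_pool_create(&pp_params);
        if (IS_ERR(pool))
                return NULL;

        return pool;
}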
247 static void page_pool_return_page(struct page_pool *pool, struct page *page);
248
249 noinline
250 static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
251 {
252         struct ptr_ring *r = &pool->ring;
253         struct page *page;
254         int pref_nid; /* preferred NUMA node */
255
256         /* Quicker fallback, avoid locks when ring is empty */
257         if (__ptr_ring_empty(r)) {
258                 alloc_stat_inc(pool, empty);
259                 return NULL;
260         }
261
262         /* Softirq guarantees the CPU, and thus the NUMA node, is stable.
263          * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
264          */
265 #ifdef CONFIG_NUMA
266         pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
267 #else
268         /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
269         pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
270 #endif
271
272         /* Refill alloc array, but only if NUMA match */
273         do {
274                 page = __ptr_ring_consume(r);
275                 if (unlikely(!page))
276                         break;
277
278                 if (likely(page_to_nid(page) == pref_nid)) {
279                         pool->alloc.cache[pool->alloc.count++] = page;
280                 } else {
281                         /* NUMA mismatch;
282                          * (1) release 1 page to the page-allocator and
283                          * (2) break out, falling through to alloc_pages_node.
284                          * This limits stress on the page buddy allocator.
285                          */
286                         page_pool_return_page(pool, page);
287                         alloc_stat_inc(pool, waive);
288                         page = NULL;
289                         break;
290                 }
291         } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
292
293         /* Return last page */
294         if (likely(pool->alloc.count > 0)) {
295                 page = pool->alloc.cache[--pool->alloc.count];
296                 alloc_stat_inc(pool, refill);
297         }
298
299         return page;
300 }
301
302 /* fast path */
303 static struct page *__page_pool_get_cached(struct page_pool *pool)
304 {
305         struct page *page;
306
307         /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
308         if (likely(pool->alloc.count)) {
309                 /* Fast-path */
310                 page = pool->alloc.cache[--pool->alloc.count];
311                 alloc_stat_inc(pool, fast);
312         } else {
313                 page = page_pool_refill_alloc_cache(pool);
314         }
315
316         return page;
317 }
318
319 static void page_pool_dma_sync_for_device(struct page_pool *pool,
320                                           struct page *page,
321                                           unsigned int dma_sync_size)
322 {
323         dma_addr_t dma_addr = page_pool_get_dma_addr(page);
324
325         dma_sync_size = min(dma_sync_size, pool->p.max_len);
326         dma_sync_single_range_for_device(pool->p.dev, dma_addr,
327                                          pool->p.offset, dma_sync_size,
328                                          pool->p.dma_dir);
329 }
330
331 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
332 {
333         dma_addr_t dma;
334
335         /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
336          * since dma_addr_t can be either 32 or 64 bits and does not always fit
337          * into page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
338          * This mapping is kept for the lifetime of the page, until it leaves the pool.
339          */
340         dma = dma_map_page_attrs(pool->p.dev, page, 0,
341                                  (PAGE_SIZE << pool->p.order),
342                                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
343                                                   DMA_ATTR_WEAK_ORDERING);
344         if (dma_mapping_error(pool->p.dev, dma))
345                 return false;
346
347         page_pool_set_dma_addr(page, dma);
348
349         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
350                 page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
351
352         return true;
353 }
354
355 static void page_pool_set_pp_info(struct page_pool *pool,
356                                   struct page *page)
357 {
358         page->pp = pool;
359         page->pp_magic |= PP_SIGNATURE;
360         if (pool->p.init_callback)
361                 pool->p.init_callback(page, pool->p.init_arg);
362 }
363
364 static void page_pool_clear_pp_info(struct page *page)
365 {
366         page->pp_magic = 0;
367         page->pp = NULL;
368 }
369
370 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
371                                                  gfp_t gfp)
372 {
373         struct page *page;
374
375         gfp |= __GFP_COMP;
376         page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
377         if (unlikely(!page))
378                 return NULL;
379
380         if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
381             unlikely(!page_pool_dma_map(pool, page))) {
382                 put_page(page);
383                 return NULL;
384         }
385
386         alloc_stat_inc(pool, slow_high_order);
387         page_pool_set_pp_info(pool, page);
388
389         /* Track how many pages are held 'in-flight' */
390         pool->pages_state_hold_cnt++;
391         trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
392         return page;
393 }
394
395 /* slow path */
396 noinline
397 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
398                                                  gfp_t gfp)
399 {
400         const int bulk = PP_ALLOC_CACHE_REFILL;
401         unsigned int pp_flags = pool->p.flags;
402         unsigned int pp_order = pool->p.order;
403         struct page *page;
404         int i, nr_pages;
405
406         /* Don't support bulk alloc for high-order pages */
407         if (unlikely(pp_order))
408                 return __page_pool_alloc_page_order(pool, gfp);
409
410         /* Unnecessary as alloc cache is empty, but guarantees zero count */
411         if (unlikely(pool->alloc.count > 0))
412                 return pool->alloc.cache[--pool->alloc.count];
413
414         /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
415         memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
416
417         nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
418                                                pool->alloc.cache);
419         if (unlikely(!nr_pages))
420                 return NULL;
421
422         /* Pages have been filled into the alloc.cache array, but the count is zero
423          * and the pages have not been DMA mapped yet (if required).
424          */
425         for (i = 0; i < nr_pages; i++) {
426                 page = pool->alloc.cache[i];
427                 if ((pp_flags & PP_FLAG_DMA_MAP) &&
428                     unlikely(!page_pool_dma_map(pool, page))) {
429                         put_page(page);
430                         continue;
431                 }
432
433                 page_pool_set_pp_info(pool, page);
434                 pool->alloc.cache[pool->alloc.count++] = page;
435                 /* Track how many pages are held 'in-flight' */
436                 pool->pages_state_hold_cnt++;
437                 trace_page_pool_state_hold(pool, page,
438                                            pool->pages_state_hold_cnt);
439         }
440
441         /* Return last page */
442         if (likely(pool->alloc.count > 0)) {
443                 page = pool->alloc.cache[--pool->alloc.count];
444                 alloc_stat_inc(pool, slow);
445         } else {
446                 page = NULL;
447         }
448
449         /* A page that was just alloc'ed should/must have refcnt == 1. */
450         return page;
451 }
452
453 /* page_pool is used to replace alloc_pages() API calls, but the caller must
454  * provide the synchronization guarantee for the allocation side.
455  */
456 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
457 {
458         struct page *page;
459
460         /* Fast-path: Get a page from cache */
461         page = __page_pool_get_cached(pool);
462         if (page)
463                 return page;
464
465         /* Slow-path: cache empty, do real allocation */
466         page = __page_pool_alloc_pages_slow(pool, gfp);
467         return page;
468 }
469 EXPORT_SYMBOL(page_pool_alloc_pages);
470
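/* Illustrative sketch, not part of this file: allocating an RX buffer from
 * NAPI poll context, where the lockless alloc cache may be used safely.  The
 * my_* name is hypothetical; posting the address to hardware is left out.
 */
static struct page *my_alloc_rx_buffer(struct page_pool *pool, dma_addr_t *dma)
{
        struct page *page;

        page = page_pool_alloc_pages(pool, GFP_ATOMIC);
        if (unlikely(!page))
                return NULL;

        /* With PP_FLAG_DMA_MAP the pool already holds the DMA mapping */
        *dma = page_pool_get_dma_addr(page);

        return page;
}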
471 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
472  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
473  */
474 #define _distance(a, b) (s32)((a) - (b))
475
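/* For example, across a wraparound: hold_cnt = 3 and release_cnt = 0xfffffffe
 * give the u32 difference 5, so _distance() reports 5 pages in-flight, which
 * is correct as long as the real distance stays below 2^31.
 */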
476 static s32 page_pool_inflight(struct page_pool *pool)
477 {
478         u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
479         u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
480         s32 inflight;
481
482         inflight = _distance(hold_cnt, release_cnt);
483
484         trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
485         WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
486
487         return inflight;
488 }
489
490 /* Disconnects a page from a page_pool.  API users can have a need
491  * to disconnect a page from its pool, to allow it to be used as
492  * a regular page (that will eventually be returned to the normal
493  * page-allocator via put_page()).
494  */
495 void page_pool_release_page(struct page_pool *pool, struct page *page)
496 {
497         dma_addr_t dma;
498         int count;
499
500         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
501                 /* Always account for inflight pages, even if we didn't
502                  * map them
503                  */
504                 goto skip_dma_unmap;
505
506         dma = page_pool_get_dma_addr(page);
507
508         /* When page is unmapped, it cannot be returned to our pool */
509         dma_unmap_page_attrs(pool->p.dev, dma,
510                              PAGE_SIZE << pool->p.order, pool->p.dma_dir,
511                              DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
512         page_pool_set_dma_addr(page, 0);
513 skip_dma_unmap:
514         page_pool_clear_pp_info(page);
515
516         /* This may be the last page returned, releasing the pool, so
517          * it is not safe to reference pool afterwards.
518          */
519         count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
520         trace_page_pool_state_release(pool, page, count);
521 }
522 EXPORT_SYMBOL(page_pool_release_page);
523
524 /* Return a page to the page allocator, cleaning up our state */
525 static void page_pool_return_page(struct page_pool *pool, struct page *page)
526 {
527         page_pool_release_page(pool, page);
528
529         put_page(page);
530         /* An optimization would be to call __free_pages(page, pool->p.order)
531          * knowing page is not part of page-cache (thus avoiding a
532          * __page_cache_release() call).
533          */
534 }
535
536 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
537 {
538         int ret;
539         /* BH protection not needed if current is softirq */
540         if (in_softirq())
541                 ret = ptr_ring_produce(&pool->ring, page);
542         else
543                 ret = ptr_ring_produce_bh(&pool->ring, page);
544
545         if (!ret) {
546                 recycle_stat_inc(pool, ring);
547                 return true;
548         }
549
550         return false;
551 }
552
553 /* Only allow direct recycling in special circumstances, into the
554  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
555  *
556  * Caller must provide appropriate safe context.
557  */
558 static bool page_pool_recycle_in_cache(struct page *page,
559                                        struct page_pool *pool)
560 {
561         if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
562                 recycle_stat_inc(pool, cache_full);
563                 return false;
564         }
565
566         /* Caller MUST have verified/know (page_ref_count(page) == 1) */
567         pool->alloc.cache[pool->alloc.count++] = page;
568         recycle_stat_inc(pool, cached);
569         return true;
570 }
571
572 /* If the page refcnt == 1, this will try to recycle the page.
573  * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
574  * the configured size min(dma_sync_size, pool->max_len).
575  * If the page refcnt != 1, then the page will be returned to the memory
576  * subsystem.
577  */
578 static __always_inline struct page *
579 __page_pool_put_page(struct page_pool *pool, struct page *page,
580                      unsigned int dma_sync_size, bool allow_direct)
581 {
582         /* This allocator is optimized for the XDP mode that uses
583          * one-frame-per-page, but has fallbacks that act like the
584          * regular page allocator APIs.
585          *
586          * refcnt == 1 means page_pool owns the page, and can recycle it.
587          *
588          * A page is NOT reusable when it was allocated while the system
589          * was under memory pressure (page_is_pfmemalloc()).
590          */
591         if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
592                 /* Read barrier done in page_ref_count / READ_ONCE */
593
594                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
595                         page_pool_dma_sync_for_device(pool, page,
596                                                       dma_sync_size);
597
598                 if (allow_direct && in_softirq() &&
599                     page_pool_recycle_in_cache(page, pool))
600                         return NULL;
601
602                 /* Page found as candidate for recycling */
603                 return page;
604         }
605         /* Fallback/non-XDP mode: the API user has an elevated refcnt.
606          *
607          * Many drivers split up the page into fragments, and some
608          * want to keep doing this to save memory and do refcnt based
609          * recycling. Support this use case too, to ease drivers
610          * switching between XDP/non-XDP.
611          *
612          * In case page_pool maintains the DMA mapping, the API user must
613          * call page_pool_put_page() once.  In this elevated refcnt
614          * case, the DMA is unmapped/released, as the driver is likely
615          * doing refcnt based recycle tricks, meaning another process
616          * will be invoking put_page().
617          */
618         recycle_stat_inc(pool, released_refcnt);
619         /* Do not replace this with page_pool_return_page() */
620         page_pool_release_page(pool, page);
621         put_page(page);
622
623         return NULL;
624 }
625
626 void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
627                                   unsigned int dma_sync_size, bool allow_direct)
628 {
629         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
630         if (page && !page_pool_recycle_in_ring(pool, page)) {
631                 /* Cache full, fallback to free pages */
632                 recycle_stat_inc(pool, ring_full);
633                 page_pool_return_page(pool, page);
634         }
635 }
636 EXPORT_SYMBOL(page_pool_put_defragged_page);
637
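/* Illustrative sketch, not part of this file: returning a whole page from a
 * NAPI/XDP_DROP path.  page_pool_put_full_page() is the header-side wrapper
 * also used later in this file; the my_* name is hypothetical.
 */
static void my_xdp_drop(struct page_pool *pool, struct page *page)
{
        /* allow_direct = true: we run in softirq (NAPI), so the page may go
         * straight back into the lockless alloc cache; otherwise it is
         * recycled via the ptr_ring or freed.
         */
        page_pool_put_full_page(pool, page, true);
}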
638 /* Caller must not use data area after call, as this function overwrites it */
639 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
640                              int count)
641 {
642         int i, bulk_len = 0;
643         bool in_softirq;
644
645         for (i = 0; i < count; i++) {
646                 struct page *page = virt_to_head_page(data[i]);
647
648                 /* It is not the last user for the page frag case */
649                 if (!page_pool_is_last_frag(pool, page))
650                         continue;
651
652                 page = __page_pool_put_page(pool, page, -1, false);
653                 /* Approved for bulk recycling in ptr_ring cache */
654                 if (page)
655                         data[bulk_len++] = page;
656         }
657
658         if (unlikely(!bulk_len))
659                 return;
660
661         /* Bulk producer into ptr_ring page_pool cache */
662         in_softirq = page_pool_producer_lock(pool);
663         for (i = 0; i < bulk_len; i++) {
664                 if (__ptr_ring_produce(&pool->ring, data[i])) {
665                         /* ring full */
666                         recycle_stat_inc(pool, ring_full);
667                         break;
668                 }
669         }
670         recycle_stat_add(pool, ring, i);
671         page_pool_producer_unlock(pool, in_softirq);
672
673         /* Hopefully all pages were returned into the ptr_ring */
674         if (likely(i == bulk_len))
675                 return;
676
677         /* ptr_ring cache full, free remaining pages outside producer lock
678          * since put_page() with refcnt == 1 can be an expensive operation
679          */
680         for (; i < bulk_len; i++)
681                 page_pool_return_page(pool, data[i]);
682 }
683 EXPORT_SYMBOL(page_pool_put_page_bulk);
684
685 static struct page *page_pool_drain_frag(struct page_pool *pool,
686                                          struct page *page)
687 {
688         long drain_count = BIAS_MAX - pool->frag_users;
689
690         /* Some user is still using the page frag */
691         if (likely(page_pool_defrag_page(page, drain_count)))
692                 return NULL;
693
694         if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
695                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
696                         page_pool_dma_sync_for_device(pool, page, -1);
697
698                 return page;
699         }
700
701         page_pool_return_page(pool, page);
702         return NULL;
703 }
704
705 static void page_pool_free_frag(struct page_pool *pool)
706 {
707         long drain_count = BIAS_MAX - pool->frag_users;
708         struct page *page = pool->frag_page;
709
710         pool->frag_page = NULL;
711
712         if (!page || page_pool_defrag_page(page, drain_count))
713                 return;
714
715         page_pool_return_page(pool, page);
716 }
717
718 struct page *page_pool_alloc_frag(struct page_pool *pool,
719                                   unsigned int *offset,
720                                   unsigned int size, gfp_t gfp)
721 {
722         unsigned int max_size = PAGE_SIZE << pool->p.order;
723         struct page *page = pool->frag_page;
724
725         if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
726                     size > max_size))
727                 return NULL;
728
729         size = ALIGN(size, dma_get_cache_alignment());
730         *offset = pool->frag_offset;
731
732         if (page && *offset + size > max_size) {
733                 page = page_pool_drain_frag(pool, page);
734                 if (page) {
735                         alloc_stat_inc(pool, fast);
736                         goto frag_reset;
737                 }
738         }
739
740         if (!page) {
741                 page = page_pool_alloc_pages(pool, gfp);
742                 if (unlikely(!page)) {
743                         pool->frag_page = NULL;
744                         return NULL;
745                 }
746
747                 pool->frag_page = page;
748
749 frag_reset:
750                 pool->frag_users = 1;
751                 *offset = 0;
752                 pool->frag_offset = size;
753                 page_pool_fragment_page(page, BIAS_MAX);
754                 return page;
755         }
756
757         pool->frag_users++;
758         pool->frag_offset = *offset + size;
759         alloc_stat_inc(pool, fast);
760         return page;
761 }
762 EXPORT_SYMBOL(page_pool_alloc_frag);
763
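/* Illustrative sketch, not part of this file: carving RX buffers out of a
 * shared page with the frag API.  Requires a pool created with
 * PP_FLAG_PAGE_FRAG; the my_* name is hypothetical.
 */
static void *my_alloc_rx_frag(struct page_pool *pool, unsigned int size,
                              dma_addr_t *dma)
{
        unsigned int offset;
        struct page *page;

        page = page_pool_alloc_frag(pool, &offset, size, GFP_ATOMIC);
        if (unlikely(!page))
                return NULL;

        /* DMA address and CPU address of this fragment within the page */
        *dma = page_pool_get_dma_addr(page) + offset;
        return page_address(page) + offset;
}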
764 static void page_pool_empty_ring(struct page_pool *pool)
765 {
766         struct page *page;
767
768         /* Empty recycle ring */
769         while ((page = ptr_ring_consume_bh(&pool->ring))) {
770                 /* Verify the refcnt invariant of cached pages */
771                 if (!(page_ref_count(page) == 1))
772                         pr_crit("%s() page_pool refcnt %d violation\n",
773                                 __func__, page_ref_count(page));
774
775                 page_pool_return_page(pool, page);
776         }
777 }
778
779 static void page_pool_free(struct page_pool *pool)
780 {
781         if (pool->disconnect)
782                 pool->disconnect(pool);
783
784         ptr_ring_cleanup(&pool->ring, NULL);
785
786         if (pool->p.flags & PP_FLAG_DMA_MAP)
787                 put_device(pool->p.dev);
788
789 #ifdef CONFIG_PAGE_POOL_STATS
790         free_percpu(pool->recycle_stats);
791 #endif
792         kfree(pool);
793 }
794
795 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
796 {
797         struct page *page;
798
799         if (pool->destroy_cnt)
800                 return;
801
802         /* Empty the alloc cache; assume the caller has made sure this is
803          * no longer in use, and that page_pool_alloc_pages() cannot be
804          * called concurrently.
805          */
806         while (pool->alloc.count) {
807                 page = pool->alloc.cache[--pool->alloc.count];
808                 page_pool_return_page(pool, page);
809         }
810 }
811
812 static void page_pool_scrub(struct page_pool *pool)
813 {
814         page_pool_empty_alloc_cache_once(pool);
815         pool->destroy_cnt++;
816
817         /* No more consumers should exist, but producers could still
818          * be in-flight.
819          */
820         page_pool_empty_ring(pool);
821 }
822
823 static int page_pool_release(struct page_pool *pool)
824 {
825         int inflight;
826
827         page_pool_scrub(pool);
828         inflight = page_pool_inflight(pool);
829         if (!inflight)
830                 page_pool_free(pool);
831
832         return inflight;
833 }
834
835 static void page_pool_release_retry(struct work_struct *wq)
836 {
837         struct delayed_work *dwq = to_delayed_work(wq);
838         struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
839         int inflight;
840
841         inflight = page_pool_release(pool);
842         if (!inflight)
843                 return;
844
845         /* Periodic warning */
846         if (time_after_eq(jiffies, pool->defer_warn)) {
847                 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
848
849                 pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
850                         __func__, inflight, sec);
851                 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
852         }
853
854         /* Still not ready to be disconnected, retry later */
855         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
856 }
857
858 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
859                            struct xdp_mem_info *mem)
860 {
861         refcount_inc(&pool->user_cnt);
862         pool->disconnect = disconnect;
863         pool->xdp_mem_id = mem->id;
864 }
865
866 void page_pool_unlink_napi(struct page_pool *pool)
867 {
868         if (!pool->p.napi)
869                 return;
870
871         /* To avoid races with recycling and additional barriers make sure
872          * pool and NAPI are unlinked when NAPI is disabled.
873          */
874         WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
875                 READ_ONCE(pool->p.napi->list_owner) != -1);
876
877         WRITE_ONCE(pool->p.napi, NULL);
878 }
879 EXPORT_SYMBOL(page_pool_unlink_napi);
880
881 void page_pool_destroy(struct page_pool *pool)
882 {
883         if (!pool)
884                 return;
885
886         if (!page_pool_put(pool))
887                 return;
888
889         page_pool_unlink_napi(pool);
890         page_pool_free_frag(pool);
891
892         if (!page_pool_release(pool))
893                 return;
894
895         pool->defer_start = jiffies;
896         pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
897
898         INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
899         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
900 }
901 EXPORT_SYMBOL(page_pool_destroy);
902
903 /* Caller must provide appropriate safe context, e.g. NAPI. */
904 void page_pool_update_nid(struct page_pool *pool, int new_nid)
905 {
906         struct page *page;
907
908         trace_page_pool_update_nid(pool, new_nid);
909         pool->p.nid = new_nid;
910
911         /* Flush pool alloc cache, as refill will check NUMA node */
912         while (pool->alloc.count) {
913                 page = pool->alloc.cache[--pool->alloc.count];
914                 page_pool_return_page(pool, page);
915         }
916 }
917 EXPORT_SYMBOL(page_pool_update_nid);
918
919 bool page_pool_return_skb_page(struct page *page, bool napi_safe)
920 {
921         struct napi_struct *napi;
922         struct page_pool *pp;
923         bool allow_direct;
924
925         page = compound_head(page);
926
927         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
928          * in order to preserve any existing bits, such as bit 0 for the
929          * head page of a compound page and bit 1 for a pfmemalloc page, so
930          * mask those bits off on the freeing side when doing the check below;
931          * page_is_pfmemalloc() is checked in __page_pool_put_page()
932          * to avoid recycling pfmemalloc pages.
933          */
934         if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
935                 return false;
936
937         pp = page->pp;
938
939         /* Allow direct recycling if we have reason to believe that we are
940          * in the same context in which the consumer would run, so there's
941          * no possible race.
942          */
943         napi = READ_ONCE(pp->p.napi);
944         allow_direct = napi_safe && napi &&
945                 READ_ONCE(napi->list_owner) == smp_processor_id();
946
947         /* The driver sets this to its memory recycling info. Reset it on recycle.
948          * This will *not* work for NICs using a split-page memory model.
949          * The page will be returned to the pool here regardless of the
950          * 'flipped' fragment being in use or not.
951          */
952         page_pool_put_full_page(pp, page, allow_direct);
953
954         return true;
955 }
956 EXPORT_SYMBOL(page_pool_return_skb_page);