net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 #include <linux/ethtool.h>
 143
 144 #include "dev.h"
 145
/*
 * File-local mutex and list head.
 * NOTE(review): presumably proto_list_mutex serializes access to
 * proto_list; their users are not visible in this part of the file,
 * confirm before relying on it.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations of static helpers defined elsewhere in this file. */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
 151
 152 /**
 153  * sk_ns_capable - General socket capability test
 154  * @sk: Socket to use a capability on or through
 155  * @user_ns: The user namespace of the capability to use
 156  * @cap: The capability to use
 157  *
 158  * Test to see if the opener of the socket had when the socket was
 159  * created and the current process has the capability @cap in the user
 160  * namespace @user_ns.
 161  */
 162 bool sk_ns_capable(const struct sock *sk,
 163                    struct user_namespace *user_ns, int cap)
 164 {
 165         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 166                 ns_capable(user_ns, cap);
 167 }
 168 EXPORT_SYMBOL(sk_ns_capable);
 169
 170 /**
 171  * sk_capable - Socket global capability test
 172  * @sk: Socket to use a capability on or through
 173  * @cap: The global capability to use
 174  *
 175  * Test to see if the opener of the socket had when the socket was
 176  * created and the current process has the capability @cap in all user
 177  * namespaces.
 178  */
 179 bool sk_capable(const struct sock *sk, int cap)
 180 {
 181         return sk_ns_capable(sk, &init_user_ns, cap);
 182 }
 183 EXPORT_SYMBOL(sk_capable);
 184
 185 /**
 186  * sk_net_capable - Network namespace socket capability test
 187  * @sk: Socket to use a capability on or through
 188  * @cap: The capability to use
 189  *
 190  * Test to see if the opener of the socket had when the socket was created
 191  * and the current process has the capability @cap over the network namespace
 192  * the socket is a member of.
 193  */
 194 bool sk_net_capable(const struct sock *sk, int cap)
 195 {
 196         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 197 }
 198 EXPORT_SYMBOL(sk_net_capable);
 199
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array below holds AF_MAX lock class keys, one per address
 * family. NOTE(review): the "_kern_" arrays are presumably the ones
 * used for internal (kernel) sockets; their users are outside this
 * view, confirm.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 209
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

/*
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the prefix @x concatenated (adjacent string literals) with
 * an address family name, ending with an extra "AF_MAX" entry. It is
 * used below as the initializer of arrays with AF_MAX+1 elements.
 *
 * NOTE(review): the entries appear to be listed in AF_* numeric order
 * (the literal "27" and "28" entries look like placeholders for values
 * that have no name here), so a new family must be inserted at the
 * position matching its number; confirm against the AF_* definitions.
 */
#define _sock_locks(x)                                            \
  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

/*
 * Per-family name tables, one table per lock-name prefix. The "k-"
 * prefixed tables mirror the sk_lock/slock/clock tables.
 * NOTE(review): presumably "k-" marks kernel sockets; confirm at the
 * point of use, which is outside this view.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};
 263
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds AF_MAX keys, one per address family.
 * NOTE(review): rlock/wlock/elock presumably correspond to the
 * receive, write and error queue locks; confirm at the point of use.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 273
/* Run time adjustable parameters. */

/* Maximum write/read memory values; start out as SK_WMEM_MAX/SK_RMEM_MAX. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults are initialized to the same constants as the maxima above. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/*
 * Enabled (1) by default.
 * NOTE(review): the consumer of this knob is outside this view; confirm
 * its exact meaning there.
 */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false. sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
 290
 291 /**
 292  * sk_set_memalloc - sets %SOCK_MEMALLOC
 293  * @sk: socket to set it on
 294  *
 295  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 296  * It's the responsibility of the admin to adjust min_free_kbytes
 297  * to meet the requirements
 298  */
 299 void sk_set_memalloc(struct sock *sk)
 300 {
 301         sock_set_flag(sk, SOCK_MEMALLOC);
 302         sk->sk_allocation |= __GFP_MEMALLOC;
 303         static_branch_inc(&memalloc_socks_key);
 304 }
 305 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 306
 307 void sk_clear_memalloc(struct sock *sk)
 308 {
 309         sock_reset_flag(sk, SOCK_MEMALLOC);
 310         sk->sk_allocation &= ~__GFP_MEMALLOC;
 311         static_branch_dec(&memalloc_socks_key);
 312
 313         /*
 314          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 315          * progress of swapping. SOCK_MEMALLOC may be cleared while
 316          * it has rmem allocations due to the last swapfile being deactivated
 317          * but there is a risk that the socket is unusable due to exceeding
 318          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 319          */
 320         sk_mem_reclaim(sk);
 321 }
 322 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 323
 324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 325 {
 326         int ret;
 327         unsigned int noreclaim_flag;
 328
 329         /* these should have been dropped before queueing */
 330         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 331
 332         noreclaim_flag = memalloc_noreclaim_save();
 333         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 334                                  tcp_v6_do_rcv,
 335                                  tcp_v4_do_rcv,
 336                                  sk, skb);
 337         memalloc_noreclaim_restore(noreclaim_flag);
 338
 339         return ret;
 340 }
 341 EXPORT_SYMBOL(__sk_backlog_rcv);
 342
 343 void sk_error_report(struct sock *sk)
 344 {
 345         sk->sk_error_report(sk);
 346
 347         switch (sk->sk_family) {
 348         case AF_INET:
 349                 fallthrough;
 350         case AF_INET6:
 351                 trace_inet_sk_error_report(sk);
 352                 break;
 353         default:
 354                 break;
 355         }
 356 }
 357 EXPORT_SYMBOL(sk_error_report);
 358
 359 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 360 {
 361         struct __kernel_sock_timeval tv;
 362
 363         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 364                 tv.tv_sec = 0;
 365                 tv.tv_usec = 0;
 366         } else {
 367                 tv.tv_sec = timeo / HZ;
 368                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 369         }
 370
 371         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 372                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 373                 *(struct old_timeval32 *)optval = tv32;
 374                 return sizeof(tv32);
 375         }
 376
 377         if (old_timeval) {
 378                 struct __kernel_old_timeval old_tv;
 379                 old_tv.tv_sec = tv.tv_sec;
 380                 old_tv.tv_usec = tv.tv_usec;
 381                 *(struct __kernel_old_timeval *)optval = old_tv;
 382                 return sizeof(old_tv);
 383         }
 384
 385         *(struct __kernel_sock_timeval *)optval = tv;
 386         return sizeof(tv);
 387 }
 388 EXPORT_SYMBOL(sock_get_timeout);
 389
 390 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 391                            sockptr_t optval, int optlen, bool old_timeval)
 392 {
 393         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 394                 struct old_timeval32 tv32;
 395
 396                 if (optlen < sizeof(tv32))
 397                         return -EINVAL;
 398
 399                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 400                         return -EFAULT;
 401                 tv->tv_sec = tv32.tv_sec;
 402                 tv->tv_usec = tv32.tv_usec;
 403         } else if (old_timeval) {
 404                 struct __kernel_old_timeval old_tv;
 405
 406                 if (optlen < sizeof(old_tv))
 407                         return -EINVAL;
 408                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 409                         return -EFAULT;
 410                 tv->tv_sec = old_tv.tv_sec;
 411                 tv->tv_usec = old_tv.tv_usec;
 412         } else {
 413                 if (optlen < sizeof(*tv))
 414                         return -EINVAL;
 415                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 416                         return -EFAULT;
 417         }
 418
 419         return 0;
 420 }
 421 EXPORT_SYMBOL(sock_copy_user_timeval);
 422
 423 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 424                             bool old_timeval)
 425 {
 426         struct __kernel_sock_timeval tv;
 427         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 428
 429         if (err)
 430                 return err;
 431
 432         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 433                 return -EDOM;
 434
 435         if (tv.tv_sec < 0) {
 436                 static int warned __read_mostly;
 437
 438                 *timeo_p = 0;
 439                 if (warned < 10 && net_ratelimit()) {
 440                         warned++;
 441                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 442                                 __func__, current->comm, task_pid_nr(current));
 443                 }
 444                 return 0;
 445         }
 446         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 447         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 448                 return 0;
 449         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 450                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 451         return 0;
 452 }
 453
 454 static bool sock_needs_netstamp(const struct sock *sk)
 455 {
 456         switch (sk->sk_family) {
 457         case AF_UNSPEC:
 458         case AF_UNIX:
 459                 return false;
 460         default:
 461                 return true;
 462         }
 463 }
 464
 465 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 466 {
 467         if (sk->sk_flags & flags) {
 468                 sk->sk_flags &= ~flags;
 469                 if (sock_needs_netstamp(sk) &&
 470                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 471                         net_disable_timestamp();
 472         }
 473 }
 474
 475
 476 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 477 {
 478         unsigned long flags;
 479         struct sk_buff_head *list = &sk->sk_receive_queue;
 480
 481         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 482                 atomic_inc(&sk->sk_drops);
 483                 trace_sock_rcvqueue_full(sk, skb);
 484                 return -ENOMEM;
 485         }
 486
 487         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 488                 atomic_inc(&sk->sk_drops);
 489                 return -ENOBUFS;
 490         }
 491
 492         skb->dev = NULL;
 493         skb_set_owner_r(skb, sk);
 494
 495         /* we escape from rcu protected region, make sure we dont leak
 496          * a norefcounted dst
 497          */
 498         skb_dst_force(skb);
 499
 500         spin_lock_irqsave(&list->lock, flags);
 501         sock_skb_set_dropcount(sk, skb);
 502         __skb_queue_tail(list, skb);
 503         spin_unlock_irqrestore(&list->lock, flags);
 504
 505         if (!sock_flag(sk, SOCK_DEAD))
 506                 sk->sk_data_ready(sk);
 507         return 0;
 508 }
 509 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 510
 511 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 512                               enum skb_drop_reason *reason)
 513 {
 514         enum skb_drop_reason drop_reason;
 515         int err;
 516
 517         err = sk_filter(sk, skb);
 518         if (err) {
 519                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 520                 goto out;
 521         }
 522         err = __sock_queue_rcv_skb(sk, skb);
 523         switch (err) {
 524         case -ENOMEM:
 525                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 526                 break;
 527         case -ENOBUFS:
 528                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 529                 break;
 530         default:
 531                 drop_reason = SKB_NOT_DROPPED_YET;
 532                 break;
 533         }
 534 out:
 535         if (reason)
 536                 *reason = drop_reason;
 537         return err;
 538 }
 539 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 540
 541 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 542                      const int nested, unsigned int trim_cap, bool refcounted)
 543 {
 544         int rc = NET_RX_SUCCESS;
 545
 546         if (sk_filter_trim_cap(sk, skb, trim_cap))
 547                 goto discard_and_relse;
 548
 549         skb->dev = NULL;
 550
 551         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 552                 atomic_inc(&sk->sk_drops);
 553                 goto discard_and_relse;
 554         }
 555         if (nested)
 556                 bh_lock_sock_nested(sk);
 557         else
 558                 bh_lock_sock(sk);
 559         if (!sock_owned_by_user(sk)) {
 560                 /*
 561                  * trylock + unlock semantics:
 562                  */
 563                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 564
 565                 rc = sk_backlog_rcv(sk, skb);
 566
 567                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 568         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 569                 bh_unlock_sock(sk);
 570                 atomic_inc(&sk->sk_drops);
 571                 goto discard_and_relse;
 572         }
 573
 574         bh_unlock_sock(sk);
 575 out:
 576         if (refcounted)
 577                 sock_put(sk);
 578         return rc;
 579 discard_and_relse:
 580         kfree_skb(skb);
 581         goto out;
 582 }
 583 EXPORT_SYMBOL(__sk_receive_skb);
 584
 585 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 586                                                           u32));
 587 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 588                                                            u32));
 589 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 590 {
 591         struct dst_entry *dst = __sk_dst_get(sk);
 592
 593         if (dst && dst->obsolete &&
 594             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 595                                dst, cookie) == NULL) {
 596                 sk_tx_queue_clear(sk);
 597                 sk->sk_dst_pending_confirm = 0;
 598                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 599                 dst_release(dst);
 600                 return NULL;
 601         }
 602
 603         return dst;
 604 }
 605 EXPORT_SYMBOL(__sk_dst_check);
 606
 607 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 608 {
 609         struct dst_entry *dst = sk_dst_get(sk);
 610
 611         if (dst && dst->obsolete &&
 612             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 613                                dst, cookie) == NULL) {
 614                 sk_dst_reset(sk);
 615                 dst_release(dst);
 616                 return NULL;
 617         }
 618
 619         return dst;
 620 }
 621 EXPORT_SYMBOL(sk_dst_check);
 622
 623 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 624 {
 625         int ret = -ENOPROTOOPT;
 626 #ifdef CONFIG_NETDEVICES
 627         struct net *net = sock_net(sk);
 628
 629         /* Sorry... */
 630         ret = -EPERM;
 631         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 632                 goto out;
 633
 634         ret = -EINVAL;
 635         if (ifindex < 0)
 636                 goto out;
 637
 638         /* Paired with all READ_ONCE() done locklessly. */
 639         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 640
 641         if (sk->sk_prot->rehash)
 642                 sk->sk_prot->rehash(sk);
 643         sk_dst_reset(sk);
 644
 645         ret = 0;
 646
 647 out:
 648 #endif
 649
 650         return ret;
 651 }
 652
 653 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 654 {
 655         int ret;
 656
 657         if (lock_sk)
 658                 lock_sock(sk);
 659         ret = sock_bindtoindex_locked(sk, ifindex);
 660         if (lock_sk)
 661                 release_sock(sk);
 662
 663         return ret;
 664 }
 665 EXPORT_SYMBOL(sock_bindtoindex);
 666
 667 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 668 {
 669         int ret = -ENOPROTOOPT;
 670 #ifdef CONFIG_NETDEVICES
 671         struct net *net = sock_net(sk);
 672         char devname[IFNAMSIZ];
 673         int index;
 674
 675         ret = -EINVAL;
 676         if (optlen < 0)
 677                 goto out;
 678
 679         /* Bind this socket to a particular device like "eth0",
 680          * as specified in the passed interface name. If the
 681          * name is "" or the option length is zero the socket
 682          * is not bound.
 683          */
 684         if (optlen > IFNAMSIZ - 1)
 685                 optlen = IFNAMSIZ - 1;
 686         memset(devname, 0, sizeof(devname));
 687
 688         ret = -EFAULT;
 689         if (copy_from_sockptr(devname, optval, optlen))
 690                 goto out;
 691
 692         index = 0;
 693         if (devname[0] != '\0') {
 694                 struct net_device *dev;
 695
 696                 rcu_read_lock();
 697                 dev = dev_get_by_name_rcu(net, devname);
 698                 if (dev)
 699                         index = dev->ifindex;
 700                 rcu_read_unlock();
 701                 ret = -ENODEV;
 702                 if (!dev)
 703                         goto out;
 704         }
 705
 706         return sock_bindtoindex(sk, index, true);
 707 out:
 708 #endif
 709
 710         return ret;
 711 }
 712
 713 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 714                                 int __user *optlen, int len)
 715 {
 716         int ret = -ENOPROTOOPT;
 717 #ifdef CONFIG_NETDEVICES
 718         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 719         struct net *net = sock_net(sk);
 720         char devname[IFNAMSIZ];
 721
 722         if (bound_dev_if == 0) {
 723                 len = 0;
 724                 goto zero;
 725         }
 726
 727         ret = -EINVAL;
 728         if (len < IFNAMSIZ)
 729                 goto out;
 730
 731         ret = netdev_get_name(net, devname, bound_dev_if);
 732         if (ret)
 733                 goto out;
 734
 735         len = strlen(devname) + 1;
 736
 737         ret = -EFAULT;
 738         if (copy_to_user(optval, devname, len))
 739                 goto out;
 740
 741 zero:
 742         ret = -EFAULT;
 743         if (put_user(len, optlen))
 744                 goto out;
 745
 746         ret = 0;
 747
 748 out:
 749 #endif
 750
 751         return ret;
 752 }
 753
 754 bool sk_mc_loop(struct sock *sk)
 755 {
 756         if (dev_recursion_level())
 757                 return false;
 758         if (!sk)
 759                 return true;
 760         switch (sk->sk_family) {
 761         case AF_INET:
 762                 return inet_sk(sk)->mc_loop;
 763 #if IS_ENABLED(CONFIG_IPV6)
 764         case AF_INET6:
 765                 return inet6_sk(sk)->mc_loop;
 766 #endif
 767         }
 768         WARN_ON_ONCE(1);
 769         return true;
 770 }
 771 EXPORT_SYMBOL(sk_mc_loop);
 772
 773 void sock_set_reuseaddr(struct sock *sk)
 774 {
 775         lock_sock(sk);
 776         sk->sk_reuse = SK_CAN_REUSE;
 777         release_sock(sk);
 778 }
 779 EXPORT_SYMBOL(sock_set_reuseaddr);
 780
 781 void sock_set_reuseport(struct sock *sk)
 782 {
 783         lock_sock(sk);
 784         sk->sk_reuseport = true;
 785         release_sock(sk);
 786 }
 787 EXPORT_SYMBOL(sock_set_reuseport);
 788
 789 void sock_no_linger(struct sock *sk)
 790 {
 791         lock_sock(sk);
 792         sk->sk_lingertime = 0;
 793         sock_set_flag(sk, SOCK_LINGER);
 794         release_sock(sk);
 795 }
 796 EXPORT_SYMBOL(sock_no_linger);
 797
 798 void sock_set_priority(struct sock *sk, u32 priority)
 799 {
 800         lock_sock(sk);
 801         sk->sk_priority = priority;
 802         release_sock(sk);
 803 }
 804 EXPORT_SYMBOL(sock_set_priority);
 805
 806 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 807 {
 808         lock_sock(sk);
 809         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 810                 sk->sk_sndtimeo = secs * HZ;
 811         else
 812                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 813         release_sock(sk);
 814 }
 815 EXPORT_SYMBOL(sock_set_sndtimeo);
 816
 817 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 818 {
 819         if (val)  {
 820                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 821                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 822                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 823                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 824         } else {
 825                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 826                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827         }
 828 }
 829
 830 void sock_enable_timestamps(struct sock *sk)
 831 {
 832         lock_sock(sk);
 833         __sock_set_timestamps(sk, true, false, true);
 834         release_sock(sk);
 835 }
 836 EXPORT_SYMBOL(sock_enable_timestamps);
 837
 838 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 839 {
 840         switch (optname) {
 841         case SO_TIMESTAMP_OLD:
 842                 __sock_set_timestamps(sk, valbool, false, false);
 843                 break;
 844         case SO_TIMESTAMP_NEW:
 845                 __sock_set_timestamps(sk, valbool, true, false);
 846                 break;
 847         case SO_TIMESTAMPNS_OLD:
 848                 __sock_set_timestamps(sk, valbool, false, true);
 849                 break;
 850         case SO_TIMESTAMPNS_NEW:
 851                 __sock_set_timestamps(sk, valbool, true, true);
 852                 break;
 853         }
 854 }
 855
 856 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 857 {
 858         struct net *net = sock_net(sk);
 859         struct net_device *dev = NULL;
 860         bool match = false;
 861         int *vclock_index;
 862         int i, num;
 863
 864         if (sk->sk_bound_dev_if)
 865                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 866
 867         if (!dev) {
 868                 pr_err("%s: sock not bind to device\n", __func__);
 869                 return -EOPNOTSUPP;
 870         }
 871
 872         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 873         dev_put(dev);
 874
 875         for (i = 0; i < num; i++) {
 876                 if (*(vclock_index + i) == phc_index) {
 877                         match = true;
 878                         break;
 879                 }
 880         }
 881
 882         if (num > 0)
 883                 kfree(vclock_index);
 884
 885         if (!match)
 886                 return -EINVAL;
 887
 888         sk->sk_bind_phc = phc_index;
 889
 890         return 0;
 891 }
 892
 893 int sock_set_timestamping(struct sock *sk, int optname,
 894                           struct so_timestamping timestamping)
 895 {
 896         int val = timestamping.flags;
 897         int ret;
 898
 899         if (val & ~SOF_TIMESTAMPING_MASK)
 900                 return -EINVAL;
 901
 902         if (val & SOF_TIMESTAMPING_OPT_ID &&
 903             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 904                 if (sk_is_tcp(sk)) {
 905                         if ((1 << sk->sk_state) &
 906                             (TCPF_CLOSE | TCPF_LISTEN))
 907                                 return -EINVAL;
 908                         atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 909                 } else {
 910                         atomic_set(&sk->sk_tskey, 0);
 911                 }
 912         }
 913
 914         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 915             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 916                 return -EINVAL;
 917
 918         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 919                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 920                 if (ret)
 921                         return ret;
 922         }
 923
 924         sk->sk_tsflags = val;
 925         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 926
 927         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 928                 sock_enable_timestamp(sk,
 929                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 930         else
 931                 sock_disable_timestamp(sk,
 932                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 933         return 0;
 934 }
 935
 936 void sock_set_keepalive(struct sock *sk)
 937 {
 938         lock_sock(sk);
 939         if (sk->sk_prot->keepalive)
 940                 sk->sk_prot->keepalive(sk, true);
 941         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 942         release_sock(sk);
 943 }
 944 EXPORT_SYMBOL(sock_set_keepalive);
 945
 946 static void __sock_set_rcvbuf(struct sock *sk, int val)
 947 {
 948         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 949          * as a negative value.
 950          */
 951         val = min_t(int, val, INT_MAX / 2);
 952         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 953
 954         /* We double it on the way in to account for "struct sk_buff" etc.
 955          * overhead.   Applications assume that the SO_RCVBUF setting they make
 956          * will allow that much actual data to be received on that socket.
 957          *
 958          * Applications are unaware that "struct sk_buff" and other overheads
 959          * allocate from the receive buffer during socket buffer allocation.
 960          *
 961          * And after considering the possible alternatives, returning the value
 962          * we actually used in getsockopt is the most desirable behavior.
 963          */
 964         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 965 }
 966
 967 void sock_set_rcvbuf(struct sock *sk, int val)
 968 {
 969         lock_sock(sk);
 970         __sock_set_rcvbuf(sk, val);
 971         release_sock(sk);
 972 }
 973 EXPORT_SYMBOL(sock_set_rcvbuf);
 974
 975 static void __sock_set_mark(struct sock *sk, u32 val)
 976 {
 977         if (val != sk->sk_mark) {
 978                 sk->sk_mark = val;
 979                 sk_dst_reset(sk);
 980         }
 981 }
 982
 983 void sock_set_mark(struct sock *sk, u32 val)
 984 {
 985         lock_sock(sk);
 986         __sock_set_mark(sk, val);
 987         release_sock(sk);
 988 }
 989 EXPORT_SYMBOL(sock_set_mark);
 990
 991 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 992 {
 993         /* Round down bytes to multiple of pages */
 994         bytes = round_down(bytes, PAGE_SIZE);
 995
 996         WARN_ON(bytes > sk->sk_reserved_mem);
 997         sk->sk_reserved_mem -= bytes;
 998         sk_mem_reclaim(sk);
 999 }
1000
1001 static int sock_reserve_memory(struct sock *sk, int bytes)
1002 {
1003         long allocated;
1004         bool charged;
1005         int pages;
1006
1007         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008                 return -EOPNOTSUPP;
1009
1010         if (!bytes)
1011                 return 0;
1012
1013         pages = sk_mem_pages(bytes);
1014
1015         /* pre-charge to memcg */
1016         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018         if (!charged)
1019                 return -ENOMEM;
1020
1021         /* pre-charge to forward_alloc */
1022         sk_memory_allocated_add(sk, pages);
1023         allocated = sk_memory_allocated(sk);
1024         /* If the system goes into memory pressure with this
1025          * precharge, give up and return error.
1026          */
1027         if (allocated > sk_prot_mem_limits(sk, 1)) {
1028                 sk_memory_allocated_sub(sk, pages);
1029                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1030                 return -ENOMEM;
1031         }
1032         sk->sk_forward_alloc += pages << PAGE_SHIFT;
1033
1034         sk->sk_reserved_mem += pages << PAGE_SHIFT;
1035
1036         return 0;
1037 }
1038
1039 /*
1040  *      This is meant for all protocols to use and covers goings on
1041  *      at the socket level. Everything here is generic.
1042  */
1043
1044 int sock_setsockopt(struct socket *sock, int level, int optname,
1045                     sockptr_t optval, unsigned int optlen)
1046 {
1047         struct so_timestamping timestamping;
1048         struct sock_txtime sk_txtime;
1049         struct sock *sk = sock->sk;
1050         int val;
1051         int valbool;
1052         struct linger ling;
1053         int ret = 0;
1054
1055         /*
1056          *      Options without arguments
1057          */
1058
1059         if (optname == SO_BINDTODEVICE)
1060                 return sock_setbindtodevice(sk, optval, optlen);
1061
1062         if (optlen < sizeof(int))
1063                 return -EINVAL;
1064
1065         if (copy_from_sockptr(&val, optval, sizeof(val)))
1066                 return -EFAULT;
1067
1068         valbool = val ? 1 : 0;
1069
1070         lock_sock(sk);
1071
1072         switch (optname) {
1073         case SO_DEBUG:
1074                 if (val && !capable(CAP_NET_ADMIN))
1075                         ret = -EACCES;
1076                 else
1077                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1078                 break;
1079         case SO_REUSEADDR:
1080                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1081                 break;
1082         case SO_REUSEPORT:
1083                 sk->sk_reuseport = valbool;
1084                 break;
1085         case SO_TYPE:
1086         case SO_PROTOCOL:
1087         case SO_DOMAIN:
1088         case SO_ERROR:
1089                 ret = -ENOPROTOOPT;
1090                 break;
1091         case SO_DONTROUTE:
1092                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1093                 sk_dst_reset(sk);
1094                 break;
1095         case SO_BROADCAST:
1096                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1097                 break;
1098         case SO_SNDBUF:
1099                 /* Don't error on this BSD doesn't and if you think
1100                  * about it this is right. Otherwise apps have to
1101                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1102                  * are treated in BSD as hints
1103                  */
1104                 val = min_t(u32, val, sysctl_wmem_max);
1105 set_sndbuf:
1106                 /* Ensure val * 2 fits into an int, to prevent max_t()
1107                  * from treating it as a negative value.
1108                  */
1109                 val = min_t(int, val, INT_MAX / 2);
1110                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1111                 WRITE_ONCE(sk->sk_sndbuf,
1112                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1113                 /* Wake up sending tasks if we upped the value. */
1114                 sk->sk_write_space(sk);
1115                 break;
1116
1117         case SO_SNDBUFFORCE:
1118                 if (!capable(CAP_NET_ADMIN)) {
1119                         ret = -EPERM;
1120                         break;
1121                 }
1122
1123                 /* No negative values (to prevent underflow, as val will be
1124                  * multiplied by 2).
1125                  */
1126                 if (val < 0)
1127                         val = 0;
1128                 goto set_sndbuf;
1129
1130         case SO_RCVBUF:
1131                 /* Don't error on this BSD doesn't and if you think
1132                  * about it this is right. Otherwise apps have to
1133                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1134                  * are treated in BSD as hints
1135                  */
1136                 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1137                 break;
1138
1139         case SO_RCVBUFFORCE:
1140                 if (!capable(CAP_NET_ADMIN)) {
1141                         ret = -EPERM;
1142                         break;
1143                 }
1144
1145                 /* No negative values (to prevent underflow, as val will be
1146                  * multiplied by 2).
1147                  */
1148                 __sock_set_rcvbuf(sk, max(val, 0));
1149                 break;
1150
1151         case SO_KEEPALIVE:
1152                 if (sk->sk_prot->keepalive)
1153                         sk->sk_prot->keepalive(sk, valbool);
1154                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1155                 break;
1156
1157         case SO_OOBINLINE:
1158                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1159                 break;
1160
1161         case SO_NO_CHECK:
1162                 sk->sk_no_check_tx = valbool;
1163                 break;
1164
1165         case SO_PRIORITY:
1166                 if ((val >= 0 && val <= 6) ||
1167                     ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1168                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1169                         sk->sk_priority = val;
1170                 else
1171                         ret = -EPERM;
1172                 break;
1173
1174         case SO_LINGER:
1175                 if (optlen < sizeof(ling)) {
1176                         ret = -EINVAL;  /* 1003.1g */
1177                         break;
1178                 }
1179                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1180                         ret = -EFAULT;
1181                         break;
1182                 }
1183                 if (!ling.l_onoff)
1184                         sock_reset_flag(sk, SOCK_LINGER);
1185                 else {
1186 #if (BITS_PER_LONG == 32)
1187                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1188                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1189                         else
1190 #endif
1191                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1192                         sock_set_flag(sk, SOCK_LINGER);
1193                 }
1194                 break;
1195
1196         case SO_BSDCOMPAT:
1197                 break;
1198
1199         case SO_PASSCRED:
1200                 if (valbool)
1201                         set_bit(SOCK_PASSCRED, &sock->flags);
1202                 else
1203                         clear_bit(SOCK_PASSCRED, &sock->flags);
1204                 break;
1205
1206         case SO_TIMESTAMP_OLD:
1207         case SO_TIMESTAMP_NEW:
1208         case SO_TIMESTAMPNS_OLD:
1209         case SO_TIMESTAMPNS_NEW:
1210                 sock_set_timestamp(sk, optname, valbool);
1211                 break;
1212
1213         case SO_TIMESTAMPING_NEW:
1214         case SO_TIMESTAMPING_OLD:
1215                 if (optlen == sizeof(timestamping)) {
1216                         if (copy_from_sockptr(&timestamping, optval,
1217                                               sizeof(timestamping))) {
1218                                 ret = -EFAULT;
1219                                 break;
1220                         }
1221                 } else {
1222                         memset(&timestamping, 0, sizeof(timestamping));
1223                         timestamping.flags = val;
1224                 }
1225                 ret = sock_set_timestamping(sk, optname, timestamping);
1226                 break;
1227
1228         case SO_RCVLOWAT:
1229                 if (val < 0)
1230                         val = INT_MAX;
1231                 if (sock->ops->set_rcvlowat)
1232                         ret = sock->ops->set_rcvlowat(sk, val);
1233                 else
1234                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1235                 break;
1236
1237         case SO_RCVTIMEO_OLD:
1238         case SO_RCVTIMEO_NEW:
1239                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1240                                        optlen, optname == SO_RCVTIMEO_OLD);
1241                 break;
1242
1243         case SO_SNDTIMEO_OLD:
1244         case SO_SNDTIMEO_NEW:
1245                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1246                                        optlen, optname == SO_SNDTIMEO_OLD);
1247                 break;
1248
1249         case SO_ATTACH_FILTER: {
1250                 struct sock_fprog fprog;
1251
1252                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1253                 if (!ret)
1254                         ret = sk_attach_filter(&fprog, sk);
1255                 break;
1256         }
1257         case SO_ATTACH_BPF:
1258                 ret = -EINVAL;
1259                 if (optlen == sizeof(u32)) {
1260                         u32 ufd;
1261
1262                         ret = -EFAULT;
1263                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1264                                 break;
1265
1266                         ret = sk_attach_bpf(ufd, sk);
1267                 }
1268                 break;
1269
1270         case SO_ATTACH_REUSEPORT_CBPF: {
1271                 struct sock_fprog fprog;
1272
1273                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1274                 if (!ret)
1275                         ret = sk_reuseport_attach_filter(&fprog, sk);
1276                 break;
1277         }
1278         case SO_ATTACH_REUSEPORT_EBPF:
1279                 ret = -EINVAL;
1280                 if (optlen == sizeof(u32)) {
1281                         u32 ufd;
1282
1283                         ret = -EFAULT;
1284                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1285                                 break;
1286
1287                         ret = sk_reuseport_attach_bpf(ufd, sk);
1288                 }
1289                 break;
1290
1291         case SO_DETACH_REUSEPORT_BPF:
1292                 ret = reuseport_detach_prog(sk);
1293                 break;
1294
1295         case SO_DETACH_FILTER:
1296                 ret = sk_detach_filter(sk);
1297                 break;
1298
1299         case SO_LOCK_FILTER:
1300                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1301                         ret = -EPERM;
1302                 else
1303                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1304                 break;
1305
1306         case SO_PASSSEC:
1307                 if (valbool)
1308                         set_bit(SOCK_PASSSEC, &sock->flags);
1309                 else
1310                         clear_bit(SOCK_PASSSEC, &sock->flags);
1311                 break;
1312         case SO_MARK:
1313                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1314                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1315                         ret = -EPERM;
1316                         break;
1317                 }
1318
1319                 __sock_set_mark(sk, val);
1320                 break;
1321         case SO_RCVMARK:
1322                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1323                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1324                         ret = -EPERM;
1325                         break;
1326                 }
1327
1328                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1329                 break;
1330
1331         case SO_RXQ_OVFL:
1332                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1333                 break;
1334
1335         case SO_WIFI_STATUS:
1336                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1337                 break;
1338
1339         case SO_PEEK_OFF:
1340                 if (sock->ops->set_peek_off)
1341                         ret = sock->ops->set_peek_off(sk, val);
1342                 else
1343                         ret = -EOPNOTSUPP;
1344                 break;
1345
1346         case SO_NOFCS:
1347                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1348                 break;
1349
1350         case SO_SELECT_ERR_QUEUE:
1351                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1352                 break;
1353
1354 #ifdef CONFIG_NET_RX_BUSY_POLL
1355         case SO_BUSY_POLL:
1356                 /* allow unprivileged users to decrease the value */
1357                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1358                         ret = -EPERM;
1359                 else {
1360                         if (val < 0)
1361                                 ret = -EINVAL;
1362                         else
1363                                 WRITE_ONCE(sk->sk_ll_usec, val);
1364                 }
1365                 break;
1366         case SO_PREFER_BUSY_POLL:
1367                 if (valbool && !capable(CAP_NET_ADMIN))
1368                         ret = -EPERM;
1369                 else
1370                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1371                 break;
1372         case SO_BUSY_POLL_BUDGET:
1373                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1374                         ret = -EPERM;
1375                 } else {
1376                         if (val < 0 || val > U16_MAX)
1377                                 ret = -EINVAL;
1378                         else
1379                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1380                 }
1381                 break;
1382 #endif
1383
1384         case SO_MAX_PACING_RATE:
1385                 {
1386                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1387
1388                 if (sizeof(ulval) != sizeof(val) &&
1389                     optlen >= sizeof(ulval) &&
1390                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1391                         ret = -EFAULT;
1392                         break;
1393                 }
1394                 if (ulval != ~0UL)
1395                         cmpxchg(&sk->sk_pacing_status,
1396                                 SK_PACING_NONE,
1397                                 SK_PACING_NEEDED);
1398                 sk->sk_max_pacing_rate = ulval;
1399                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1400                 break;
1401                 }
1402         case SO_INCOMING_CPU:
1403                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1404                 break;
1405
1406         case SO_CNX_ADVICE:
1407                 if (val == 1)
1408                         dst_negative_advice(sk);
1409                 break;
1410
1411         case SO_ZEROCOPY:
1412                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1413                         if (!(sk_is_tcp(sk) ||
1414                               (sk->sk_type == SOCK_DGRAM &&
1415                                sk->sk_protocol == IPPROTO_UDP)))
1416                                 ret = -EOPNOTSUPP;
1417                 } else if (sk->sk_family != PF_RDS) {
1418                         ret = -EOPNOTSUPP;
1419                 }
1420                 if (!ret) {
1421                         if (val < 0 || val > 1)
1422                                 ret = -EINVAL;
1423                         else
1424                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1425                 }
1426                 break;
1427
1428         case SO_TXTIME:
1429                 if (optlen != sizeof(struct sock_txtime)) {
1430                         ret = -EINVAL;
1431                         break;
1432                 } else if (copy_from_sockptr(&sk_txtime, optval,
1433                            sizeof(struct sock_txtime))) {
1434                         ret = -EFAULT;
1435                         break;
1436                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1437                         ret = -EINVAL;
1438                         break;
1439                 }
1440                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1441                  * scheduler has enough safe guards.
1442                  */
1443                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1444                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1445                         ret = -EPERM;
1446                         break;
1447                 }
1448                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1449                 sk->sk_clockid = sk_txtime.clockid;
1450                 sk->sk_txtime_deadline_mode =
1451                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1452                 sk->sk_txtime_report_errors =
1453                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1454                 break;
1455
1456         case SO_BINDTOIFINDEX:
1457                 ret = sock_bindtoindex_locked(sk, val);
1458                 break;
1459
1460         case SO_BUF_LOCK:
1461                 if (val & ~SOCK_BUF_LOCK_MASK) {
1462                         ret = -EINVAL;
1463                         break;
1464                 }
1465                 sk->sk_userlocks = val | (sk->sk_userlocks &
1466                                           ~SOCK_BUF_LOCK_MASK);
1467                 break;
1468
1469         case SO_RESERVE_MEM:
1470         {
1471                 int delta;
1472
1473                 if (val < 0) {
1474                         ret = -EINVAL;
1475                         break;
1476                 }
1477
1478                 delta = val - sk->sk_reserved_mem;
1479                 if (delta < 0)
1480                         sock_release_reserved_memory(sk, -delta);
1481                 else
1482                         ret = sock_reserve_memory(sk, delta);
1483                 break;
1484         }
1485
1486         case SO_TXREHASH:
1487                 if (val < -1 || val > 1) {
1488                         ret = -EINVAL;
1489                         break;
1490                 }
1491                 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1492                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1493                 break;
1494
1495         default:
1496                 ret = -ENOPROTOOPT;
1497                 break;
1498         }
1499         release_sock(sk);
1500         return ret;
1501 }
1502 EXPORT_SYMBOL(sock_setsockopt);
1503
1504 static const struct cred *sk_get_peer_cred(struct sock *sk)
1505 {
1506         const struct cred *cred;
1507
1508         spin_lock(&sk->sk_peer_lock);
1509         cred = get_cred(sk->sk_peer_cred);
1510         spin_unlock(&sk->sk_peer_lock);
1511
1512         return cred;
1513 }
1514
1515 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1516                           struct ucred *ucred)
1517 {
1518         ucred->pid = pid_vnr(pid);
1519         ucred->uid = ucred->gid = -1;
1520         if (cred) {
1521                 struct user_namespace *current_ns = current_user_ns();
1522
1523                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1524                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1525         }
1526 }
1527
1528 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1529 {
1530         struct user_namespace *user_ns = current_user_ns();
1531         int i;
1532
1533         for (i = 0; i < src->ngroups; i++)
1534                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1535                         return -EFAULT;
1536
1537         return 0;
1538 }
1539
1540 int sock_getsockopt(struct socket *sock, int level, int optname,
1541                     char __user *optval, int __user *optlen)
1542 {
1543         struct sock *sk = sock->sk;
1544
1545         union {
1546                 int val;
1547                 u64 val64;
1548                 unsigned long ulval;
1549                 struct linger ling;
1550                 struct old_timeval32 tm32;
1551                 struct __kernel_old_timeval tm;
1552                 struct  __kernel_sock_timeval stm;
1553                 struct sock_txtime txtime;
1554                 struct so_timestamping timestamping;
1555         } v;
1556
1557         int lv = sizeof(int);
1558         int len;
1559
1560         if (get_user(len, optlen))
1561                 return -EFAULT;
1562         if (len < 0)
1563                 return -EINVAL;
1564
1565         memset(&v, 0, sizeof(v));
1566
1567         switch (optname) {
1568         case SO_DEBUG:
1569                 v.val = sock_flag(sk, SOCK_DBG);
1570                 break;
1571
1572         case SO_DONTROUTE:
1573                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1574                 break;
1575
1576         case SO_BROADCAST:
1577                 v.val = sock_flag(sk, SOCK_BROADCAST);
1578                 break;
1579
1580         case SO_SNDBUF:
1581                 v.val = sk->sk_sndbuf;
1582                 break;
1583
1584         case SO_RCVBUF:
1585                 v.val = sk->sk_rcvbuf;
1586                 break;
1587
1588         case SO_REUSEADDR:
1589                 v.val = sk->sk_reuse;
1590                 break;
1591
1592         case SO_REUSEPORT:
1593                 v.val = sk->sk_reuseport;
1594                 break;
1595
1596         case SO_KEEPALIVE:
1597                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1598                 break;
1599
1600         case SO_TYPE:
1601                 v.val = sk->sk_type;
1602                 break;
1603
1604         case SO_PROTOCOL:
1605                 v.val = sk->sk_protocol;
1606                 break;
1607
1608         case SO_DOMAIN:
1609                 v.val = sk->sk_family;
1610                 break;
1611
1612         case SO_ERROR:
1613                 v.val = -sock_error(sk);
1614                 if (v.val == 0)
1615                         v.val = xchg(&sk->sk_err_soft, 0);
1616                 break;
1617
1618         case SO_OOBINLINE:
1619                 v.val = sock_flag(sk, SOCK_URGINLINE);
1620                 break;
1621
1622         case SO_NO_CHECK:
1623                 v.val = sk->sk_no_check_tx;
1624                 break;
1625
1626         case SO_PRIORITY:
1627                 v.val = sk->sk_priority;
1628                 break;
1629
1630         case SO_LINGER:
1631                 lv              = sizeof(v.ling);
1632                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1633                 v.ling.l_linger = sk->sk_lingertime / HZ;
1634                 break;
1635
1636         case SO_BSDCOMPAT:
1637                 break;
1638
1639         case SO_TIMESTAMP_OLD:
1640                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1641                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1642                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1643                 break;
1644
1645         case SO_TIMESTAMPNS_OLD:
1646                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1647                 break;
1648
1649         case SO_TIMESTAMP_NEW:
1650                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1651                 break;
1652
1653         case SO_TIMESTAMPNS_NEW:
1654                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1655                 break;
1656
1657         case SO_TIMESTAMPING_OLD:
1658                 lv = sizeof(v.timestamping);
1659                 v.timestamping.flags = sk->sk_tsflags;
1660                 v.timestamping.bind_phc = sk->sk_bind_phc;
1661                 break;
1662
1663         case SO_RCVTIMEO_OLD:
1664         case SO_RCVTIMEO_NEW:
1665                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1666                 break;
1667
1668         case SO_SNDTIMEO_OLD:
1669         case SO_SNDTIMEO_NEW:
1670                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1671                 break;
1672
1673         case SO_RCVLOWAT:
1674                 v.val = sk->sk_rcvlowat;
1675                 break;
1676
1677         case SO_SNDLOWAT:
1678                 v.val = 1;
1679                 break;
1680
1681         case SO_PASSCRED:
1682                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1683                 break;
1684
1685         case SO_PEERCRED:
1686         {
1687                 struct ucred peercred;
1688                 if (len > sizeof(peercred))
1689                         len = sizeof(peercred);
1690
1691                 spin_lock(&sk->sk_peer_lock);
1692                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1693                 spin_unlock(&sk->sk_peer_lock);
1694
1695                 if (copy_to_user(optval, &peercred, len))
1696                         return -EFAULT;
1697                 goto lenout;
1698         }
1699
1700         case SO_PEERGROUPS:
1701         {
1702                 const struct cred *cred;
1703                 int ret, n;
1704
1705                 cred = sk_get_peer_cred(sk);
1706                 if (!cred)
1707                         return -ENODATA;
1708
1709                 n = cred->group_info->ngroups;
1710                 if (len < n * sizeof(gid_t)) {
1711                         len = n * sizeof(gid_t);
1712                         put_cred(cred);
1713                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1714                 }
1715                 len = n * sizeof(gid_t);
1716
1717                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1718                 put_cred(cred);
1719                 if (ret)
1720                         return ret;
1721                 goto lenout;
1722         }
1723
1724         case SO_PEERNAME:
1725         {
1726                 char address[128];
1727
1728                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1729                 if (lv < 0)
1730                         return -ENOTCONN;
1731                 if (lv < len)
1732                         return -EINVAL;
1733                 if (copy_to_user(optval, address, len))
1734                         return -EFAULT;
1735                 goto lenout;
1736         }
1737
1738         /* Dubious BSD thing... Probably nobody even uses it, but
1739          * the UNIX standard wants it for whatever reason... -DaveM
1740          */
1741         case SO_ACCEPTCONN:
1742                 v.val = sk->sk_state == TCP_LISTEN;
1743                 break;
1744
1745         case SO_PASSSEC:
1746                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1747                 break;
1748
1749         case SO_PEERSEC:
1750                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1751
1752         case SO_MARK:
1753                 v.val = sk->sk_mark;
1754                 break;
1755
1756         case SO_RCVMARK:
1757                 v.val = sock_flag(sk, SOCK_RCVMARK);
1758                 break;
1759
1760         case SO_RXQ_OVFL:
1761                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1762                 break;
1763
1764         case SO_WIFI_STATUS:
1765                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1766                 break;
1767
1768         case SO_PEEK_OFF:
1769                 if (!sock->ops->set_peek_off)
1770                         return -EOPNOTSUPP;
1771
1772                 v.val = sk->sk_peek_off;
1773                 break;
1774         case SO_NOFCS:
1775                 v.val = sock_flag(sk, SOCK_NOFCS);
1776                 break;
1777
1778         case SO_BINDTODEVICE:
1779                 return sock_getbindtodevice(sk, optval, optlen, len);
1780
1781         case SO_GET_FILTER:
1782                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1783                 if (len < 0)
1784                         return len;
1785
1786                 goto lenout;
1787
1788         case SO_LOCK_FILTER:
1789                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1790                 break;
1791
1792         case SO_BPF_EXTENSIONS:
1793                 v.val = bpf_tell_extensions();
1794                 break;
1795
1796         case SO_SELECT_ERR_QUEUE:
1797                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1798                 break;
1799
1800 #ifdef CONFIG_NET_RX_BUSY_POLL
1801         case SO_BUSY_POLL:
1802                 v.val = sk->sk_ll_usec;
1803                 break;
1804         case SO_PREFER_BUSY_POLL:
1805                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1806                 break;
1807 #endif
1808
1809         case SO_MAX_PACING_RATE:
1810                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1811                         lv = sizeof(v.ulval);
1812                         v.ulval = sk->sk_max_pacing_rate;
1813                 } else {
1814                         /* 32bit version */
1815                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1816                 }
1817                 break;
1818
1819         case SO_INCOMING_CPU:
1820                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1821                 break;
1822
1823         case SO_MEMINFO:
1824         {
1825                 u32 meminfo[SK_MEMINFO_VARS];
1826
1827                 sk_get_meminfo(sk, meminfo);
1828
1829                 len = min_t(unsigned int, len, sizeof(meminfo));
1830                 if (copy_to_user(optval, &meminfo, len))
1831                         return -EFAULT;
1832
1833                 goto lenout;
1834         }
1835
1836 #ifdef CONFIG_NET_RX_BUSY_POLL
1837         case SO_INCOMING_NAPI_ID:
1838                 v.val = READ_ONCE(sk->sk_napi_id);
1839
1840                 /* aggregate non-NAPI IDs down to 0 */
1841                 if (v.val < MIN_NAPI_ID)
1842                         v.val = 0;
1843
1844                 break;
1845 #endif
1846
1847         case SO_COOKIE:
1848                 lv = sizeof(u64);
1849                 if (len < lv)
1850                         return -EINVAL;
1851                 v.val64 = sock_gen_cookie(sk);
1852                 break;
1853
1854         case SO_ZEROCOPY:
1855                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1856                 break;
1857
1858         case SO_TXTIME:
1859                 lv = sizeof(v.txtime);
1860                 v.txtime.clockid = sk->sk_clockid;
1861                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1862                                   SOF_TXTIME_DEADLINE_MODE : 0;
1863                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1864                                   SOF_TXTIME_REPORT_ERRORS : 0;
1865                 break;
1866
1867         case SO_BINDTOIFINDEX:
1868                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1869                 break;
1870
1871         case SO_NETNS_COOKIE:
1872                 lv = sizeof(u64);
1873                 if (len != lv)
1874                         return -EINVAL;
1875                 v.val64 = sock_net(sk)->net_cookie;
1876                 break;
1877
1878         case SO_BUF_LOCK:
1879                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1880                 break;
1881
1882         case SO_RESERVE_MEM:
1883                 v.val = sk->sk_reserved_mem;
1884                 break;
1885
1886         case SO_TXREHASH:
1887                 v.val = sk->sk_txrehash;
1888                 break;
1889
1890         default:
1891                 /* We implement the SO_SNDLOWAT etc to not be settable
1892                  * (1003.1g 7).
1893                  */
1894                 return -ENOPROTOOPT;
1895         }
1896
1897         if (len > lv)
1898                 len = lv;
1899         if (copy_to_user(optval, &v, len))
1900                 return -EFAULT;
1901 lenout:
1902         if (put_user(len, optlen))
1903                 return -EFAULT;
1904         return 0;
1905 }
1906
1907 /*
1908  * Initialize an sk_lock.
1909  *
1910  * (We also register the sk_lock with the lock validator.)
1911  */
1912 static inline void sock_lock_init(struct sock *sk)
1913 {
1914         if (sk->sk_kern_sock)
1915                 sock_lock_init_class_and_name(
1916                         sk,
1917                         af_family_kern_slock_key_strings[sk->sk_family],
1918                         af_family_kern_slock_keys + sk->sk_family,
1919                         af_family_kern_key_strings[sk->sk_family],
1920                         af_family_kern_keys + sk->sk_family);
1921         else
1922                 sock_lock_init_class_and_name(
1923                         sk,
1924                         af_family_slock_key_strings[sk->sk_family],
1925                         af_family_slock_keys + sk->sk_family,
1926                         af_family_key_strings[sk->sk_family],
1927                         af_family_keys + sk->sk_family);
1928 }
1929
1930 /*
1931  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1932  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1933  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1934  */
1935 static void sock_copy(struct sock *nsk, const struct sock *osk)
1936 {
1937         const struct proto *prot = READ_ONCE(osk->sk_prot);
1938 #ifdef CONFIG_SECURITY_NETWORK
1939         void *sptr = nsk->sk_security;
1940 #endif
1941
1942         /* If we move sk_tx_queue_mapping out of the private section,
1943          * we must check if sk_tx_queue_clear() is called after
1944          * sock_copy() in sk_clone_lock().
1945          */
1946         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1947                      offsetof(struct sock, sk_dontcopy_begin) ||
1948                      offsetof(struct sock, sk_tx_queue_mapping) >=
1949                      offsetof(struct sock, sk_dontcopy_end));
1950
1951         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1952
1953         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1954                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1955
1956 #ifdef CONFIG_SECURITY_NETWORK
1957         nsk->sk_security = sptr;
1958         security_sk_clone(osk, nsk);
1959 #endif
1960 }
1961
1962 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1963                 int family)
1964 {
1965         struct sock *sk;
1966         struct kmem_cache *slab;
1967
1968         slab = prot->slab;
1969         if (slab != NULL) {
1970                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1971                 if (!sk)
1972                         return sk;
1973                 if (want_init_on_alloc(priority))
1974                         sk_prot_clear_nulls(sk, prot->obj_size);
1975         } else
1976                 sk = kmalloc(prot->obj_size, priority);
1977
1978         if (sk != NULL) {
1979                 if (security_sk_alloc(sk, family, priority))
1980                         goto out_free;
1981
1982                 if (!try_module_get(prot->owner))
1983                         goto out_free_sec;
1984         }
1985
1986         return sk;
1987
1988 out_free_sec:
1989         security_sk_free(sk);
1990 out_free:
1991         if (slab != NULL)
1992                 kmem_cache_free(slab, sk);
1993         else
1994                 kfree(sk);
1995         return NULL;
1996 }
1997
1998 static void sk_prot_free(struct proto *prot, struct sock *sk)
1999 {
2000         struct kmem_cache *slab;
2001         struct module *owner;
2002
2003         owner = prot->owner;
2004         slab = prot->slab;
2005
2006         cgroup_sk_free(&sk->sk_cgrp_data);
2007         mem_cgroup_sk_free(sk);
2008         security_sk_free(sk);
2009         if (slab != NULL)
2010                 kmem_cache_free(slab, sk);
2011         else
2012                 kfree(sk);
2013         module_put(owner);
2014 }
2015
2016 /**
2017  *      sk_alloc - All socket objects are allocated here
2018  *      @net: the applicable net namespace
2019  *      @family: protocol family
2020  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2021  *      @prot: struct proto associated with this new sock instance
2022  *      @kern: is this to be a kernel socket?
2023  */
2024 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2025                       struct proto *prot, int kern)
2026 {
2027         struct sock *sk;
2028
2029         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2030         if (sk) {
2031                 sk->sk_family = family;
2032                 /*
2033                  * See comment in struct sock definition to understand
2034                  * why we need sk_prot_creator -acme
2035                  */
2036                 sk->sk_prot = sk->sk_prot_creator = prot;
2037                 sk->sk_kern_sock = kern;
2038                 sock_lock_init(sk);
2039                 sk->sk_net_refcnt = kern ? 0 : 1;
2040                 if (likely(sk->sk_net_refcnt)) {
2041                         get_net_track(net, &sk->ns_tracker, priority);
2042                         sock_inuse_add(net, 1);
2043                 }
2044
2045                 sock_net_set(sk, net);
2046                 refcount_set(&sk->sk_wmem_alloc, 1);
2047
2048                 mem_cgroup_sk_alloc(sk);
2049                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2050                 sock_update_classid(&sk->sk_cgrp_data);
2051                 sock_update_netprioidx(&sk->sk_cgrp_data);
2052                 sk_tx_queue_clear(sk);
2053         }
2054
2055         return sk;
2056 }
2057 EXPORT_SYMBOL(sk_alloc);
2058
/* Final teardown of a sock; @head is the sk_rcu member embedded in it.
 *
 * Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 * sk_destruct() calls it directly when no grace period is requested.
 */
static void __sk_destruct(struct rcu_head *head)
{
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct sk_filter *filter;

        /* Protocol/owner specific destructor, when one is installed. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /* The condition handed to rcu_dereference_check() is that
         * sk_wmem_alloc has reached zero.
         */
        filter = rcu_dereference_check(sk->sk_filter,
                                       refcount_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
        bpf_sk_storage_free(sk);
#endif

        /* Option memory still accounted at this point is only reported. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the page held in sk_frag, if any. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
        put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);

        /* Release the netns reference if this socket holds one
         * (sk_net_refcnt), then free the object through the proto that
         * created it.
         */
        if (likely(sk->sk_net_refcnt))
                put_net_track(sock_net(sk), &sk->ns_tracker);
        sk_prot_free(sk->sk_prot_creator, sk);
}
2100
2101 void sk_destruct(struct sock *sk)
2102 {
2103         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2104
2105         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2106                 reuseport_detach_sock(sk);
2107                 use_call_rcu = true;
2108         }
2109
2110         if (use_call_rcu)
2111                 call_rcu(&sk->sk_rcu, __sk_destruct);
2112         else
2113                 __sk_destruct(&sk->sk_rcu);
2114 }
2115
2116 static void __sk_free(struct sock *sk)
2117 {
2118         if (likely(sk->sk_net_refcnt))
2119                 sock_inuse_add(sock_net(sk), -1);
2120
2121         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2122                 sock_diag_broadcast_destroy(sk);
2123         else
2124                 sk_destruct(sk);
2125 }
2126
2127 void sk_free(struct sock *sk)
2128 {
2129         /*
2130          * We subtract one from sk_wmem_alloc and can know if
2131          * some packets are still in some tx queue.
2132          * If not null, sock_wfree() will call __sk_free(sk) later
2133          */
2134         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2135                 __sk_free(sk);
2136 }
2137 EXPORT_SYMBOL(sk_free);
2138
2139 static void sk_init_common(struct sock *sk)
2140 {
2141         skb_queue_head_init(&sk->sk_receive_queue);
2142         skb_queue_head_init(&sk->sk_write_queue);
2143         skb_queue_head_init(&sk->sk_error_queue);
2144
2145         rwlock_init(&sk->sk_callback_lock);
2146         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2147                         af_rlock_keys + sk->sk_family,
2148                         af_family_rlock_key_strings[sk->sk_family]);
2149         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2150                         af_wlock_keys + sk->sk_family,
2151                         af_family_wlock_key_strings[sk->sk_family]);
2152         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2153                         af_elock_keys + sk->sk_family,
2154                         af_family_elock_key_strings[sk->sk_family]);
2155         lockdep_set_class_and_name(&sk->sk_callback_lock,
2156                         af_callback_keys + sk->sk_family,
2157                         af_family_clock_key_strings[sk->sk_family]);
2158 }
2159
/**
 *      sk_clone_lock - clone a socket, and lock its clone
 *      @sk: the socket to clone
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *      Returns the clone, locked with bh_lock_sock() and with sk_refcnt
 *      set to 2, or NULL if the allocation, the filter charge, the xfrm
 *      policy clone or the BPF storage clone failed.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct proto *prot = READ_ONCE(sk->sk_prot);
        struct sk_filter *filter;
        bool is_charged = true;
        struct sock *newsk;

        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
        if (!newsk)
                goto out;

        /* Start from a copy of the parent; the fields that must not be
         * inherited are reset one by one below.
         */
        sock_copy(newsk, sk);

        newsk->sk_prot_creator = prot;

        /* SANITY */
        if (likely(newsk->sk_net_refcnt)) {
                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
                sock_inuse_add(sock_net(newsk), 1);
        }
        sk_node_init(&newsk->sk_node);
        sock_lock_init(newsk);
        bh_lock_sock(newsk);
        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
        newsk->sk_backlog.len = 0;

        atomic_set(&newsk->sk_rmem_alloc, 0);

        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
        refcount_set(&newsk->sk_wmem_alloc, 1);

        atomic_set(&newsk->sk_omem_alloc, 0);
        sk_init_common(newsk);

        newsk->sk_dst_cache     = NULL;
        newsk->sk_dst_pending_confirm = 0;
        newsk->sk_wmem_queued   = 0;
        newsk->sk_forward_alloc = 0;
        newsk->sk_reserved_mem  = 0;
        atomic_set(&newsk->sk_drops, 0);
        newsk->sk_send_head     = NULL;
        /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
        atomic_set(&newsk->sk_zckey, 0);

        sock_reset_flag(newsk, SOCK_DONE);

        /* sk->sk_memcg will be populated at accept() time */
        newsk->sk_memcg = NULL;

        cgroup_sk_clone(&newsk->sk_cgrp_data);

        /* Share the parent's filter, charging it to the clone. */
        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                /* though it's an empty new sock, the charging may fail
                 * if sysctl_optmem_max was changed between creation of
                 * original socket and cloning
                 */
                is_charged = sk_filter_charge(newsk, filter);
        RCU_INIT_POINTER(newsk->sk_filter, filter);
        rcu_read_unlock();

        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                /* We need to make sure that we don't uncharge the new
                 * socket if we couldn't charge it in the first place
                 * as otherwise we uncharge the parent's filter.
                 */
                if (!is_charged)
                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }
        /* The clone does not inherit the parent's reuseport callback. */
        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

        if (bpf_sk_storage_clone(sk, newsk)) {
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }

        /* Clear sk_user_data if parent had the pointer tagged
         * as not suitable for copying when cloning.
         */
        if (sk_user_data_is_nocopy(newsk))
                newsk->sk_user_data = NULL;

        newsk->sk_err      = 0;
        newsk->sk_err_soft = 0;
        newsk->sk_priority = 0;
        newsk->sk_incoming_cpu = raw_smp_processor_id();

        /* Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&newsk->sk_refcnt, 2);

        /* Increment the counter in the same struct proto as the master
         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
         * is the same as sk->sk_prot->socks, as this field was copied
         * with memcpy).
         *
         * This _changes_ the previous behaviour, where
         * tcp_create_openreq_child always was incrementing the
         * equivalent to tcp_prot->socks (inet_sock_nr), so this have
         * to be taken into account in all callers. -acme
         */
        sk_refcnt_debug_inc(newsk);
        /* The clone is not attached to any struct socket or wait queue yet. */
        sk_set_socket(newsk, NULL);
        sk_tx_queue_clear(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, NULL);

        if (newsk->sk_prot->sockets_allocated)
                sk_sockets_allocated_inc(newsk);

        /* The timestamp flags were copied from the parent; take the
         * matching net_enable_timestamp() for the clone.
         */
        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2289
/*
 * Dispose of a clone that is still locked with bh_lock_sock(), as used by
 * the error paths of sk_clone_lock(): clear its destructor, unlock it and
 * release it with sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
        /* It is still a raw copy of the parent, so invalidate the
         * destructor and do a plain sk_free().
         */
        sk->sk_destruct = NULL;
        bh_unlock_sock(sk);
        sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2299
2300 static void sk_trim_gso_size(struct sock *sk)
2301 {
2302         if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2303                 return;
2304 #if IS_ENABLED(CONFIG_IPV6)
2305         if (sk->sk_family == AF_INET6 &&
2306             sk_is_tcp(sk) &&
2307             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2308                 return;
2309 #endif
2310         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2311 }
2312
2313 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2314 {
2315         u32 max_segs = 1;
2316
2317         sk_dst_set(sk, dst);
2318         sk->sk_route_caps = dst->dev->features;
2319         if (sk_is_tcp(sk))
2320                 sk->sk_route_caps |= NETIF_F_GSO;
2321         if (sk->sk_route_caps & NETIF_F_GSO)
2322                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2323         if (unlikely(sk->sk_gso_disabled))
2324                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2325         if (sk_can_gso(sk)) {
2326                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2327                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2328                 } else {
2329                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2330                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2331                         sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2332                         sk_trim_gso_size(sk);
2333                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2334                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2335                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2336                 }
2337         }
2338         sk->sk_gso_max_segs = max_segs;
2339 }
2340 EXPORT_SYMBOL_GPL(sk_setup_caps);
2341
2342 /*
2343  *      Simple resource managers for sockets.
2344  */
2345
2346
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Gives skb->truesize back to sk->sk_wmem_alloc, calls the socket's
 * write-space callback unless SOCK_USE_WRITE_QUEUE is set, and frees the
 * socket with __sk_free() if sk_wmem_alloc reaches zero.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;
        bool free;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /* SOCK_RCU_FREE sockets that use the default write-space
                 * callback: the whole truesize is released at once and the
                 * wakeup helper runs inside an RCU read-side section.
                 */
                if (sock_flag(sk, SOCK_RCU_FREE) &&
                    sk->sk_write_space == sock_def_write_space) {
                        rcu_read_lock();
                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
                        sock_def_write_space_wfree(sk);
                        rcu_read_unlock();
                        if (unlikely(free))
                                __sk_free(sk);
                        return;
                }

                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
                sk->sk_write_space(sk);
                /* Only the one unit kept above is left to drop. */
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2384
2385 /* This variant of sock_wfree() is used by TCP,
2386  * since it sets SOCK_USE_WRITE_QUEUE.
2387  */
2388 void __sock_wfree(struct sk_buff *skb)
2389 {
2390         struct sock *sk = skb->sk;
2391
2392         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2393                 __sk_free(sk);
2394 }
2395
/*
 * Make @sk the owner of @skb on the write side.
 *
 * Any previous owner is dropped with skb_orphan().  For a full socket the
 * skb gets sock_wfree() as destructor and its truesize is added to
 * sk->sk_wmem_alloc.  With CONFIG_INET, a socket that is not a full socket
 * is instead pinned with sock_hold() and gets sock_edemux() as destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
#ifdef CONFIG_INET
        if (unlikely(!sk_fullsock(sk))) {
                skb->destructor = sock_edemux;
                sock_hold(sk);
                return;
        }
#endif
        skb->destructor = sock_wfree;
        skb_set_hash_from_sk(skb, sk);
        /*
         * We used to take a refcount on sk, but following operation
         * is enough to guarantee sk_free() wont free this sock until
         * all in-flight packets are completed
         */
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2417
2418 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2419 {
2420 #ifdef CONFIG_TLS_DEVICE
2421         /* Drivers depend on in-order delivery for crypto offload,
2422          * partial orphan breaks out-of-order-OK logic.
2423          */
2424         if (skb->decrypted)
2425                 return false;
2426 #endif
2427         return (skb->destructor == sock_wfree ||
2428                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2429 }
2430
2431 /* This helper is used by netem, as it can hold packets in its
2432  * delay queue. We want to allow the owner socket to send more
2433  * packets, as if they were already TX completed by a typical driver.
2434  * But we also want to keep skb->sk set because some packet schedulers
2435  * rely on it (sch_fq for example).
2436  */
2437 void skb_orphan_partial(struct sk_buff *skb)
2438 {
2439         if (skb_is_tcp_pure_ack(skb))
2440                 return;
2441
2442         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2443                 return;
2444
2445         skb_orphan(skb);
2446 }
2447 EXPORT_SYMBOL(skb_orphan_partial);
2448
/*
 * Read buffer destructor automatically called from kfree_skb.
 *
 * Subtracts skb->truesize from the owning socket's sk_rmem_alloc and
 * uncharges the same amount with sk_mem_uncharge().
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;

        atomic_sub(len, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);
2461
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * It only drops one socket reference with sock_put(); no memory
 * accounting is touched.
 */
void sock_efree(struct sk_buff *skb)
{
        sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
2471
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The socket reference is dropped only when sk_is_refcounted() says that
 * one is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!sk_is_refcounted(sk))
                return;

        sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2483
2484 kuid_t sock_i_uid(struct sock *sk)
2485 {
2486         kuid_t uid;
2487
2488         read_lock_bh(&sk->sk_callback_lock);
2489         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2490         read_unlock_bh(&sk->sk_callback_lock);
2491         return uid;
2492 }
2493 EXPORT_SYMBOL(sock_i_uid);
2494
2495 unsigned long sock_i_ino(struct sock *sk)
2496 {
2497         unsigned long ino;
2498
2499         read_lock_bh(&sk->sk_callback_lock);
2500         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2501         read_unlock_bh(&sk->sk_callback_lock);
2502         return ino;
2503 }
2504 EXPORT_SYMBOL(sock_i_ino);
2505
2506 /*
2507  * Allocate a skb from the socket's send buffer.
2508  */
2509 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2510                              gfp_t priority)
2511 {
2512         if (force ||
2513             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2514                 struct sk_buff *skb = alloc_skb(size, priority);
2515
2516                 if (skb) {
2517                         skb_set_owner_w(skb, sk);
2518                         return skb;
2519                 }
2520         }
2521         return NULL;
2522 }
2523 EXPORT_SYMBOL(sock_wmalloc);
2524
2525 static void sock_ofree(struct sk_buff *skb)
2526 {
2527         struct sock *sk = skb->sk;
2528
2529         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2530 }
2531
2532 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2533                              gfp_t priority)
2534 {
2535         struct sk_buff *skb;
2536
2537         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2538         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2539             sysctl_optmem_max)
2540                 return NULL;
2541
2542         skb = alloc_skb(size, priority);
2543         if (!skb)
2544                 return NULL;
2545
2546         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2547         skb->sk = sk;
2548         skb->destructor = sock_ofree;
2549         return skb;
2550 }
2551
2552 /*
2553  * Allocate a memory block from the socket's option memory buffer.
2554  */
2555 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2556 {
2557         if ((unsigned int)size <= sysctl_optmem_max &&
2558             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2559                 void *mem;
2560                 /* First do the add, to avoid the race if kmalloc
2561                  * might sleep.
2562                  */
2563                 atomic_add(size, &sk->sk_omem_alloc);
2564                 mem = kmalloc(size, priority);
2565                 if (mem)
2566                         return mem;
2567                 atomic_sub(size, &sk->sk_omem_alloc);
2568         }
2569         return NULL;
2570 }
2571 EXPORT_SYMBOL(sock_kmalloc);
2572
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 *
 * @mem is released with kfree_sensitive() when @nullify is true and with
 * kfree() otherwise; @size is then subtracted from sk->sk_omem_alloc.
 * A NULL @mem triggers a one-time warning and nothing else is done.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
                                  const bool nullify)
{
        if (WARN_ON_ONCE(!mem))
                return;
        if (nullify)
                kfree_sensitive(mem);
        else
                kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
2588
2589 void sock_kfree_s(struct sock *sk, void *mem, int size)
2590 {
2591         __sock_kfree_s(sk, mem, size, false);
2592 }
2593 EXPORT_SYMBOL(sock_kfree_s);
2594
2595 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2596 {
2597         __sock_kfree_s(sk, mem, size, true);
2598 }
2599 EXPORT_SYMBOL(sock_kzfree_s);
2600
2601 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2602    I think, these locks should be removed for datagram sockets.
2603  */
2604 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2605 {
2606         DEFINE_WAIT(wait);
2607
2608         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2609         for (;;) {
2610                 if (!timeo)
2611                         break;
2612                 if (signal_pending(current))
2613                         break;
2614                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2615                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2616                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2617                         break;
2618                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2619                         break;
2620                 if (sk->sk_err)
2621                         break;
2622                 timeo = schedule_timeout(timeo);
2623         }
2624         finish_wait(sk_sleep(sk), &wait);
2625         return timeo;
2626 }
2627
2628
2629 /*
2630  *      Generic send/receive buffer handlers
2631  */
2632
2633 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2634                                      unsigned long data_len, int noblock,
2635                                      int *errcode, int max_page_order)
2636 {
2637         struct sk_buff *skb;
2638         long timeo;
2639         int err;
2640
2641         timeo = sock_sndtimeo(sk, noblock);
2642         for (;;) {
2643                 err = sock_error(sk);
2644                 if (err != 0)
2645                         goto failure;
2646
2647                 err = -EPIPE;
2648                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2649                         goto failure;
2650
2651                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2652                         break;
2653
2654                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2655                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2656                 err = -EAGAIN;
2657                 if (!timeo)
2658                         goto failure;
2659                 if (signal_pending(current))
2660                         goto interrupted;
2661                 timeo = sock_wait_for_wmem(sk, timeo);
2662         }
2663         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2664                                    errcode, sk->sk_allocation);
2665         if (skb)
2666                 skb_set_owner_w(skb, sk);
2667         return skb;
2668
2669 interrupted:
2670         err = sock_intr_errno(timeo);
2671 failure:
2672         *errcode = err;
2673         return NULL;
2674 }
2675 EXPORT_SYMBOL(sock_alloc_send_pskb);
2676
2677 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2678                      struct sockcm_cookie *sockc)
2679 {
2680         u32 tsflags;
2681
2682         switch (cmsg->cmsg_type) {
2683         case SO_MARK:
2684                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2685                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2686                         return -EPERM;
2687                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2688                         return -EINVAL;
2689                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2690                 break;
2691         case SO_TIMESTAMPING_OLD:
2692                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2693                         return -EINVAL;
2694
2695                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2696                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2697                         return -EINVAL;
2698
2699                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2700                 sockc->tsflags |= tsflags;
2701                 break;
2702         case SCM_TXTIME:
2703                 if (!sock_flag(sk, SOCK_TXTIME))
2704                         return -EINVAL;
2705                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2706                         return -EINVAL;
2707                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2708                 break;
2709         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2710         case SCM_RIGHTS:
2711         case SCM_CREDENTIALS:
2712                 break;
2713         default:
2714                 return -EINVAL;
2715         }
2716         return 0;
2717 }
2718 EXPORT_SYMBOL(__sock_cmsg_send);
2719
2720 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2721                    struct sockcm_cookie *sockc)
2722 {
2723         struct cmsghdr *cmsg;
2724         int ret;
2725
2726         for_each_cmsghdr(cmsg, msg) {
2727                 if (!CMSG_OK(msg, cmsg))
2728                         return -EINVAL;
2729                 if (cmsg->cmsg_level != SOL_SOCKET)
2730                         continue;
2731                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2732                 if (ret)
2733                         return ret;
2734         }
2735         return 0;
2736 }
2737 EXPORT_SYMBOL(sock_cmsg_send);
2738
2739 static void sk_enter_memory_pressure(struct sock *sk)
2740 {
2741         if (!sk->sk_prot->enter_memory_pressure)
2742                 return;
2743
2744         sk->sk_prot->enter_memory_pressure(sk);
2745 }
2746
2747 static void sk_leave_memory_pressure(struct sock *sk)
2748 {
2749         if (sk->sk_prot->leave_memory_pressure) {
2750                 sk->sk_prot->leave_memory_pressure(sk);
2751         } else {
2752                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2753
2754                 if (memory_pressure && READ_ONCE(*memory_pressure))
2755                         WRITE_ONCE(*memory_pressure, 0);
2756         }
2757 }
2758
/* Static key, false by default. While it is enabled,
 * skb_page_frag_refill() skips its high-order allocation attempt and
 * goes straight to a single page.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2760
2761 /**
2762  * skb_page_frag_refill - check that a page_frag contains enough room
2763  * @sz: minimum size of the fragment we want to get
2764  * @pfrag: pointer to page_frag
2765  * @gfp: priority for memory allocation
2766  *
2767  * Note: While this allocator tries to use high order pages, there is
2768  * no guarantee that allocations succeed. Therefore, @sz MUST be
2769  * less or equal than PAGE_SIZE.
2770  */
2771 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2772 {
2773         if (pfrag->page) {
2774                 if (page_ref_count(pfrag->page) == 1) {
2775                         pfrag->offset = 0;
2776                         return true;
2777                 }
2778                 if (pfrag->offset + sz <= pfrag->size)
2779                         return true;
2780                 put_page(pfrag->page);
2781         }
2782
2783         pfrag->offset = 0;
2784         if (SKB_FRAG_PAGE_ORDER &&
2785             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2786                 /* Avoid direct reclaim but allow kswapd to wake */
2787                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2788                                           __GFP_COMP | __GFP_NOWARN |
2789                                           __GFP_NORETRY,
2790                                           SKB_FRAG_PAGE_ORDER);
2791                 if (likely(pfrag->page)) {
2792                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2793                         return true;
2794                 }
2795         }
2796         pfrag->page = alloc_page(gfp);
2797         if (likely(pfrag->page)) {
2798                 pfrag->size = PAGE_SIZE;
2799                 return true;
2800         }
2801         return false;
2802 }
2803 EXPORT_SYMBOL(skb_page_frag_refill);
2804
2805 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2806 {
2807         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2808                 return true;
2809
2810         sk_enter_memory_pressure(sk);
2811         sk_stream_moderate_sndbuf(sk);
2812         return false;
2813 }
2814 EXPORT_SYMBOL(sk_page_frag_refill);
2815
2816 void __lock_sock(struct sock *sk)
2817         __releases(&sk->sk_lock.slock)
2818         __acquires(&sk->sk_lock.slock)
2819 {
2820         DEFINE_WAIT(wait);
2821
2822         for (;;) {
2823                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2824                                         TASK_UNINTERRUPTIBLE);
2825                 spin_unlock_bh(&sk->sk_lock.slock);
2826                 schedule();
2827                 spin_lock_bh(&sk->sk_lock.slock);
2828                 if (!sock_owned_by_user(sk))
2829                         break;
2830         }
2831         finish_wait(&sk->sk_lock.wq, &wait);
2832 }
2833
/*
 * Drain the socket backlog: every skb queued on sk->sk_backlog is handed
 * to sk_backlog_rcv().
 *
 * Per the sparse annotations, sk_lock.slock is held on entry and on
 * return; it is released while a detached batch of skbs is processed.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        /* The head is re-read after each batch, once slock is held again. */
        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Detach the whole list so it can be walked without slock. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        /* Save the successor before the skb is unlinked. */
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantee we can not loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
2866
2867 void __sk_flush_backlog(struct sock *sk)
2868 {
2869         spin_lock_bh(&sk->sk_lock.slock);
2870         __release_sock(sk);
2871         spin_unlock_bh(&sk->sk_lock.slock);
2872 }
2873 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2874
2875 /**
2876  * sk_wait_data - wait for data to arrive at sk_receive_queue
2877  * @sk:    sock to wait on
2878  * @timeo: for how long
2879  * @skb:   last skb seen on sk_receive_queue
2880  *
2881  * Now socket state including sk->sk_err is changed only under lock,
2882  * hence we may omit checks after joining wait queue.
2883  * We check receive queue before schedule() only as optimization;
2884  * it is very likely that release_sock() added new data.
2885  */
2886 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2887 {
2888         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2889         int rc;
2890
2891         add_wait_queue(sk_sleep(sk), &wait);
2892         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2893         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2894         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2895         remove_wait_queue(sk_sleep(sk), &wait);
2896         return rc;
2897 }
2898 EXPORT_SYMBOL(sk_wait_data);
2899
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate (only consulted for the sndbuf
 *             comparison in the suppress path)
 *      @amt: pages to allocate
 *      @kind: allocation type (SK_MEM_SEND or SK_MEM_RECV)
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *      Return: 1 if the allocation is accepted, 0 if it is suppressed, in
 *      which case the protocol accounting (and a successful memcg charge)
 *      is rolled back before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
        struct proto *prot = sk->sk_prot;
        bool charged = true;
        long allocated;

        /* Account first, then decide whether the new total is acceptable. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);
        if (memcg_charge &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                                gfp_memcg_charge())))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are measured by sk_wmem_queued, all
                 * others by sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /* Accept while the hard limit still exceeds this socket's
                 * page usage multiplied by the number of allocated sockets.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_charge && !charged) {
                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        /* Receive-side failures caused only by a failed memcg charge are
         * not traced.
         */
        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Roll back the accounting done at the top of the function. */
        sk_memory_allocated_sub(sk, amt);

        if (memcg_charge && charged)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
2994
2995 /**
2996  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2997  *      @sk: socket
2998  *      @size: memory size to allocate
2999  *      @kind: allocation type
3000  *
3001  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3002  *      rmem allocation. This function assumes that protocols which have
3003  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3004  */
3005 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3006 {
3007         int ret, amt = sk_mem_pages(size);
3008
3009         sk->sk_forward_alloc += amt << PAGE_SHIFT;
3010         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3011         if (!ret)
3012                 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3013         return ret;
3014 }
3015 EXPORT_SYMBOL(__sk_mem_schedule);
3016
3017 /**
3018  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3019  *      @sk: socket
3020  *      @amount: number of quanta
3021  *
3022  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3023  */
3024 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3025 {
3026         sk_memory_allocated_sub(sk, amount);
3027
3028         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3029                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3030
3031         if (sk_under_memory_pressure(sk) &&
3032             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3033                 sk_leave_memory_pressure(sk);
3034 }
3035
3036 /**
3037  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3038  *      @sk: socket
3039  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3040  */
3041 void __sk_mem_reclaim(struct sock *sk, int amount)
3042 {
3043         amount >>= PAGE_SHIFT;
3044         sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3045         __sk_mem_reduce_allocated(sk, amount);
3046 }
3047 EXPORT_SYMBOL(__sk_mem_reclaim);
3048
3049 int sk_set_peek_off(struct sock *sk, int val)
3050 {
3051         sk->sk_peek_off = val;
3052         return 0;
3053 }
3054 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3055
3056 /*
3057  * Set of default routines for initialising struct proto_ops when
3058  * the protocol does not support a particular function. In certain
3059  * cases where it makes no sense for a protocol to have a "do nothing"
3060  * function, some default processing is provided.
3061  */
3062
/* Default bind handler for protocols without bind support: always
 * fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
3067 EXPORT_SYMBOL(sock_no_bind);
3068
/* Default connect handler for protocols without connect support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
3074 EXPORT_SYMBOL(sock_no_connect);
3075
/* Default socketpair handler for protocols without socketpair support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
3080 EXPORT_SYMBOL(sock_no_socketpair);
3081
/* Default accept handler for protocols without accept support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
3087 EXPORT_SYMBOL(sock_no_accept);
3088
/* Default getname handler for protocols without getname support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
3094 EXPORT_SYMBOL(sock_no_getname);
3095
/* Default ioctl handler for protocols without ioctl support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
3100 EXPORT_SYMBOL(sock_no_ioctl);
3101
/* Default listen handler for protocols without listen support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
3106 EXPORT_SYMBOL(sock_no_listen);
3107
/* Default shutdown handler for protocols without shutdown support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
3112 EXPORT_SYMBOL(sock_no_shutdown);
3113
/* Default sendmsg handler for protocols without sendmsg support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3118 EXPORT_SYMBOL(sock_no_sendmsg);
3119
/* Default sendmsg_locked handler for protocols without support for it:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3124 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3125
/* Default recvmsg handler for protocols without recvmsg support:
 * always fails with -EOPNOTSUPP, arguments are ignored.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
3131 EXPORT_SYMBOL(sock_no_recvmsg);
3132
/* Default mmap handler for protocols without mmap support. Unlike the
 * other stubs it returns -ENODEV, mirroring the error code used when an
 * mmap method is missing. Arguments are ignored.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        return -ENODEV;
}
3138 EXPORT_SYMBOL(sock_no_mmap);
3139
3140 /*
3141  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3142  * various sock-based usage counts.
3143  */
3144 void __receive_sock(struct file *file)
3145 {
3146         struct socket *sock;
3147
3148         sock = sock_from_file(file);
3149         if (sock) {
3150                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3151                 sock_update_classid(&sock->sk->sk_cgrp_data);
3152         }
3153 }
3154
3155 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3156 {
3157         ssize_t res;
3158         struct msghdr msg = {.msg_flags = flags};
3159         struct kvec iov;
3160         char *kaddr = kmap(page);
3161         iov.iov_base = kaddr + offset;
3162         iov.iov_len = size;
3163         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3164         kunmap(page);
3165         return res;
3166 }
3167 EXPORT_SYMBOL(sock_no_sendpage);
3168
3169 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3170                                 int offset, size_t size, int flags)
3171 {
3172         ssize_t res;
3173         struct msghdr msg = {.msg_flags = flags};
3174         struct kvec iov;
3175         char *kaddr = kmap(page);
3176
3177         iov.iov_base = kaddr + offset;
3178         iov.iov_len = size;
3179         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3180         kunmap(page);
3181         return res;
3182 }
3183 EXPORT_SYMBOL(sock_no_sendpage_locked);
3184
3185 /*
3186  *      Default Socket Callbacks
3187  */
3188
3189 static void sock_def_wakeup(struct sock *sk)
3190 {
3191         struct socket_wq *wq;
3192
3193         rcu_read_lock();
3194         wq = rcu_dereference(sk->sk_wq);
3195         if (skwq_has_sleeper(wq))
3196                 wake_up_interruptible_all(&wq->wait);
3197         rcu_read_unlock();
3198 }
3199
3200 static void sock_def_error_report(struct sock *sk)
3201 {
3202         struct socket_wq *wq;
3203
3204         rcu_read_lock();
3205         wq = rcu_dereference(sk->sk_wq);
3206         if (skwq_has_sleeper(wq))
3207                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3208         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3209         rcu_read_unlock();
3210 }
3211
3212 void sock_def_readable(struct sock *sk)
3213 {
3214         struct socket_wq *wq;
3215
3216         rcu_read_lock();
3217         wq = rcu_dereference(sk->sk_wq);
3218         if (skwq_has_sleeper(wq))
3219                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3220                                                 EPOLLRDNORM | EPOLLRDBAND);
3221         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3222         rcu_read_unlock();
3223 }
3224
3225 static void sock_def_write_space(struct sock *sk)
3226 {
3227         struct socket_wq *wq;
3228
3229         rcu_read_lock();
3230
3231         /* Do not wake up a writer until he can make "significant"
3232          * progress.  --DaveM
3233          */
3234         if (sock_writeable(sk)) {
3235                 wq = rcu_dereference(sk->sk_wq);
3236                 if (skwq_has_sleeper(wq))
3237                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3238                                                 EPOLLWRNORM | EPOLLWRBAND);
3239
3240                 /* Should agree with poll, otherwise some programs break */
3241                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3242         }
3243
3244         rcu_read_unlock();
3245 }
3246
3247 /* An optimised version of sock_def_write_space(), should only be called
3248  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3249  * ->sk_wmem_alloc.
3250  */
3251 static void sock_def_write_space_wfree(struct sock *sk)
3252 {
3253         /* Do not wake up a writer until he can make "significant"
3254          * progress.  --DaveM
3255          */
3256         if (sock_writeable(sk)) {
3257                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3258
3259                 /* rely on refcount_sub from sock_wfree() */
3260                 smp_mb__after_atomic();
3261                 if (wq && waitqueue_active(&wq->wait))
3262                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3263                                                 EPOLLWRNORM | EPOLLWRBAND);
3264
3265                 /* Should agree with poll, otherwise some programs break */
3266                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3267         }
3268 }
3269
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3273
3274 void sk_send_sigurg(struct sock *sk)
3275 {
3276         if (sk->sk_socket && sk->sk_socket->file)
3277                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3278                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3279 }
3280 EXPORT_SYMBOL(sk_send_sigurg);
3281
/* (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
3288 EXPORT_SYMBOL(sk_reset_timer);
3289
/* Cancel @timer with del_timer(); when that returns non-zero, drop a
 * socket reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
3295 EXPORT_SYMBOL(sk_stop_timer);
3296
/* Cancel @timer with del_timer_sync(); when that returns non-zero, drop
 * a socket reference with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer_sync(timer))
                return;

        __sock_put(sk);
}
3302 EXPORT_SYMBOL(sk_stop_timer_sync);
3303
3304 void sock_init_data(struct socket *sock, struct sock *sk)
3305 {
3306         sk_init_common(sk);
3307         sk->sk_send_head        =       NULL;
3308
3309         timer_setup(&sk->sk_timer, NULL, 0);
3310
3311         sk->sk_allocation       =       GFP_KERNEL;
3312         sk->sk_rcvbuf           =       sysctl_rmem_default;
3313         sk->sk_sndbuf           =       sysctl_wmem_default;
3314         sk->sk_state            =       TCP_CLOSE;
3315         sk_set_socket(sk, sock);
3316
3317         sock_set_flag(sk, SOCK_ZAPPED);
3318
3319         if (sock) {
3320                 sk->sk_type     =       sock->type;
3321                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3322                 sock->sk        =       sk;
3323                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3324         } else {
3325                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3326                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3327         }
3328
3329         rwlock_init(&sk->sk_callback_lock);
3330         if (sk->sk_kern_sock)
3331                 lockdep_set_class_and_name(
3332                         &sk->sk_callback_lock,
3333                         af_kern_callback_keys + sk->sk_family,
3334                         af_family_kern_clock_key_strings[sk->sk_family]);
3335         else
3336                 lockdep_set_class_and_name(
3337                         &sk->sk_callback_lock,
3338                         af_callback_keys + sk->sk_family,
3339                         af_family_clock_key_strings[sk->sk_family]);
3340
3341         sk->sk_state_change     =       sock_def_wakeup;
3342         sk->sk_data_ready       =       sock_def_readable;
3343         sk->sk_write_space      =       sock_def_write_space;
3344         sk->sk_error_report     =       sock_def_error_report;
3345         sk->sk_destruct         =       sock_def_destruct;
3346
3347         sk->sk_frag.page        =       NULL;
3348         sk->sk_frag.offset      =       0;
3349         sk->sk_peek_off         =       -1;
3350
3351         sk->sk_peer_pid         =       NULL;
3352         sk->sk_peer_cred        =       NULL;
3353         spin_lock_init(&sk->sk_peer_lock);
3354
3355         sk->sk_write_pending    =       0;
3356         sk->sk_rcvlowat         =       1;
3357         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3358         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3359
3360         sk->sk_stamp = SK_DEFAULT_STAMP;
3361 #if BITS_PER_LONG==32
3362         seqlock_init(&sk->sk_stamp_seq);
3363 #endif
3364         atomic_set(&sk->sk_zckey, 0);
3365
3366 #ifdef CONFIG_NET_RX_BUSY_POLL
3367         sk->sk_napi_id          =       0;
3368         sk->sk_ll_usec          =       sysctl_net_busy_read;
3369 #endif
3370
3371         sk->sk_max_pacing_rate = ~0UL;
3372         sk->sk_pacing_rate = ~0UL;
3373         WRITE_ONCE(sk->sk_pacing_shift, 10);
3374         sk->sk_incoming_cpu = -1;
3375         sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3376
3377         sk_rx_queue_clear(sk);
3378         /*
3379          * Before updating sk_refcnt, we must commit prior changes to memory
3380          * (Documentation/RCU/rculist_nulls.rst for details)
3381          */
3382         smp_wmb();
3383         refcount_set(&sk->sk_refcnt, 1);
3384         atomic_set(&sk->sk_drops, 0);
3385 }
3386 EXPORT_SYMBOL(sock_init_data);
3387
3388 void lock_sock_nested(struct sock *sk, int subclass)
3389 {
3390         /* The sk_lock has mutex_lock() semantics here. */
3391         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3392
3393         might_sleep();
3394         spin_lock_bh(&sk->sk_lock.slock);
3395         if (sock_owned_by_user_nocheck(sk))
3396                 __lock_sock(sk);
3397         sk->sk_lock.owned = 1;
3398         spin_unlock_bh(&sk->sk_lock.slock);
3399 }
3400 EXPORT_SYMBOL(lock_sock_nested);
3401
3402 void release_sock(struct sock *sk)
3403 {
3404         spin_lock_bh(&sk->sk_lock.slock);
3405         if (sk->sk_backlog.tail)
3406                 __release_sock(sk);
3407
3408         /* Warning : release_cb() might need to release sk ownership,
3409          * ie call sock_release_ownership(sk) before us.
3410          */
3411         if (sk->sk_prot->release_cb)
3412                 sk->sk_prot->release_cb(sk);
3413
3414         sock_release_ownership(sk);
3415         if (waitqueue_active(&sk->sk_lock.wq))
3416                 wake_up(&sk->sk_lock.wq);
3417         spin_unlock_bh(&sk->sk_lock.slock);
3418 }
3419 EXPORT_SYMBOL(release_sock);
3420
3421 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3422 {
3423         might_sleep();
3424         spin_lock_bh(&sk->sk_lock.slock);
3425
3426         if (!sock_owned_by_user_nocheck(sk)) {
3427                 /*
3428                  * Fast path return with bottom halves disabled and
3429                  * sock::sk_lock.slock held.
3430                  *
3431                  * The 'mutex' is not contended and holding
3432                  * sock::sk_lock.slock prevents all other lockers to
3433                  * proceed so the corresponding unlock_sock_fast() can
3434                  * avoid the slow path of release_sock() completely and
3435                  * just release slock.
3436                  *
3437                  * From a semantical POV this is equivalent to 'acquiring'
3438                  * the 'mutex', hence the corresponding lockdep
3439                  * mutex_release() has to happen in the fast path of
3440                  * unlock_sock_fast().
3441                  */
3442                 return false;
3443         }
3444
3445         __lock_sock(sk);
3446         sk->sk_lock.owned = 1;
3447         __acquire(&sk->sk_lock.slock);
3448         spin_unlock_bh(&sk->sk_lock.slock);
3449         return true;
3450 }
3451 EXPORT_SYMBOL(__lock_sock_fast);
3452
3453 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3454                    bool timeval, bool time32)
3455 {
3456         struct sock *sk = sock->sk;
3457         struct timespec64 ts;
3458
3459         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3460         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3461         if (ts.tv_sec == -1)
3462                 return -ENOENT;
3463         if (ts.tv_sec == 0) {
3464                 ktime_t kt = ktime_get_real();
3465                 sock_write_timestamp(sk, kt);
3466                 ts = ktime_to_timespec64(kt);
3467         }
3468
3469         if (timeval)
3470                 ts.tv_nsec /= 1000;
3471
3472 #ifdef CONFIG_COMPAT_32BIT_TIME
3473         if (time32)
3474                 return put_old_timespec32(&ts, userstamp);
3475 #endif
3476 #ifdef CONFIG_SPARC64
3477         /* beware of padding in sparc64 timeval */
3478         if (timeval && !in_compat_syscall()) {
3479                 struct __kernel_old_timeval __user tv = {
3480                         .tv_sec = ts.tv_sec,
3481                         .tv_usec = ts.tv_nsec,
3482                 };
3483                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3484                         return -EFAULT;
3485                 return 0;
3486         }
3487 #endif
3488         return put_timespec64(&ts, userstamp);
3489 }
3490 EXPORT_SYMBOL(sock_gettstamp);
3491
3492 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3493 {
3494         if (!sock_flag(sk, flag)) {
3495                 unsigned long previous_flags = sk->sk_flags;
3496
3497                 sock_set_flag(sk, flag);
3498                 /*
3499                  * we just set one of the two flags which require net
3500                  * time stamping, but time stamping might have been on
3501                  * already because of the other one
3502                  */
3503                 if (sock_needs_netstamp(sk) &&
3504                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3505                         net_enable_timestamp();
3506         }
3507 }
3508
3509 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3510                        int level, int type)
3511 {
3512         struct sock_exterr_skb *serr;
3513         struct sk_buff *skb;
3514         int copied, err;
3515
3516         err = -EAGAIN;
3517         skb = sock_dequeue_err_skb(sk);
3518         if (skb == NULL)
3519                 goto out;
3520
3521         copied = skb->len;
3522         if (copied > len) {
3523                 msg->msg_flags |= MSG_TRUNC;
3524                 copied = len;
3525         }
3526         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3527         if (err)
3528                 goto out_free_skb;
3529
3530         sock_recv_timestamp(msg, sk, skb);
3531
3532         serr = SKB_EXT_ERR(skb);
3533         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3534
3535         msg->msg_flags |= MSG_ERRQUEUE;
3536         err = copied;
3537
3538 out_free_skb:
3539         kfree_skb(skb);
3540 out:
3541         return err;
3542 }
3543 EXPORT_SYMBOL(sock_recv_errqueue);
3544
3545 /*
3546  *      Get a socket option on an socket.
3547  *
3548  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3549  *      asynchronous errors should be reported by getsockopt. We assume
3550  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3551  */
3552 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3553                            char __user *optval, int __user *optlen)
3554 {
3555         struct sock *sk = sock->sk;
3556
3557         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3558 }
3559 EXPORT_SYMBOL(sock_common_getsockopt);
3560
3561 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3562                         int flags)
3563 {
3564         struct sock *sk = sock->sk;
3565         int addr_len = 0;
3566         int err;
3567
3568         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3569         if (err >= 0)
3570                 msg->msg_namelen = addr_len;
3571         return err;
3572 }
3573 EXPORT_SYMBOL(sock_common_recvmsg);
3574
3575 /*
3576  *      Set socket options on an inet socket.
3577  */
3578 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3579                            sockptr_t optval, unsigned int optlen)
3580 {
3581         struct sock *sk = sock->sk;
3582
3583         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3584 }
3585 EXPORT_SYMBOL(sock_common_setsockopt);
3586
/*
 * Common socket release path: run the protocol's optional destroy hook,
 * unhash the socket, orphan it, free its xfrm policy and drop one
 * reference.
 */
void sk_common_release(struct sock *sk)
{
        /* ->destroy is optional, ->unhash below is called unconditionally */
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes no
         * longer have access to the socket, but the network stack still
         * does.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but some
         * packets may still be in flight because a CPU running the
         * receive path did its hash table lookup before we unhashed the
         * socket. They will reach the receive queue and will be purged
         * by the socket destructor.
         *
         * We also still have packets pending on the receive queue and,
         * probably, our own packets waiting in device queues.
         * sock_destroy will drain the receive queue, but transmitted
         * packets will delay socket destruction until the last reference
         * is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        sock_put(sk);
}
3622 EXPORT_SYMBOL(sk_common_release);
3623
3624 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3625 {
3626         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3627
3628         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3629         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3630         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3631         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3632         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3633         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3634         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3635         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3636         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3637 }
3638
3639 #ifdef CONFIG_PROC_FS
3640 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3641
3642 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3643 {
3644         int cpu, idx = prot->inuse_idx;
3645         int res = 0;
3646
3647         for_each_possible_cpu(cpu)
3648                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3649
3650         return res >= 0 ? res : 0;
3651 }
3652 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3653
3654 int sock_inuse_get(struct net *net)
3655 {
3656         int cpu, res = 0;
3657
3658         for_each_possible_cpu(cpu)
3659                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3660
3661         return res;
3662 }
3663
3664 EXPORT_SYMBOL_GPL(sock_inuse_get);
3665
3666 static int __net_init sock_inuse_init_net(struct net *net)
3667 {
3668         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3669         if (net->core.prot_inuse == NULL)
3670                 return -ENOMEM;
3671         return 0;
3672 }
3673
3674 static void __net_exit sock_inuse_exit_net(struct net *net)
3675 {
3676         free_percpu(net->core.prot_inuse);
3677 }
3678
3679 static struct pernet_operations net_inuse_ops = {
3680         .init = sock_inuse_init_net,
3681         .exit = sock_inuse_exit_net,
3682 };
3683
3684 static __init int net_inuse_init(void)
3685 {
3686         if (register_pernet_subsys(&net_inuse_ops))
3687                 panic("Cannot initialize net inuse counters");
3688
3689         return 0;
3690 }
3691
3692 core_initcall(net_inuse_init);
3693
3694 static int assign_proto_idx(struct proto *prot)
3695 {
3696         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3697
3698         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3699                 pr_err("PROTO_INUSE_NR exhausted\n");
3700                 return -ENOSPC;
3701         }
3702
3703         set_bit(prot->inuse_idx, proto_inuse_idx);
3704         return 0;
3705 }
3706
3707 static void release_proto_idx(struct proto *prot)
3708 {
3709         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3710                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3711 }
3712 #else
/* !CONFIG_PROC_FS: no in-use index is tracked, assignment always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS: nothing was assigned, so there is nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3721
3722 #endif
3723
3724 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3725 {
3726         if (!twsk_prot)
3727                 return;
3728         kfree(twsk_prot->twsk_slab_name);
3729         twsk_prot->twsk_slab_name = NULL;
3730         kmem_cache_destroy(twsk_prot->twsk_slab);
3731         twsk_prot->twsk_slab = NULL;
3732 }
3733
3734 static int tw_prot_init(const struct proto *prot)
3735 {
3736         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3737
3738         if (!twsk_prot)
3739                 return 0;
3740
3741         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3742                                               prot->name);
3743         if (!twsk_prot->twsk_slab_name)
3744                 return -ENOMEM;
3745
3746         twsk_prot->twsk_slab =
3747                 kmem_cache_create(twsk_prot->twsk_slab_name,
3748                                   twsk_prot->twsk_obj_size, 0,
3749                                   SLAB_ACCOUNT | prot->slab_flags,
3750                                   NULL);
3751         if (!twsk_prot->twsk_slab) {
3752                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3753                         prot->name);
3754                 return -ENOMEM;
3755         }
3756
3757         return 0;
3758 }
3759
3760 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3761 {
3762         if (!rsk_prot)
3763                 return;
3764         kfree(rsk_prot->slab_name);
3765         rsk_prot->slab_name = NULL;
3766         kmem_cache_destroy(rsk_prot->slab);
3767         rsk_prot->slab = NULL;
3768 }
3769
3770 static int req_prot_init(const struct proto *prot)
3771 {
3772         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3773
3774         if (!rsk_prot)
3775                 return 0;
3776
3777         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3778                                         prot->name);
3779         if (!rsk_prot->slab_name)
3780                 return -ENOMEM;
3781
3782         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3783                                            rsk_prot->obj_size, 0,
3784                                            SLAB_ACCOUNT | prot->slab_flags,
3785                                            NULL);
3786
3787         if (!rsk_prot->slab) {
3788                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3789                         prot->name);
3790                 return -ENOMEM;
3791         }
3792         return 0;
3793 }
3794
3795 int proto_register(struct proto *prot, int alloc_slab)
3796 {
3797         int ret = -ENOBUFS;
3798
3799         if (prot->memory_allocated && !prot->sysctl_mem) {
3800                 pr_err("%s: missing sysctl_mem\n", prot->name);
3801                 return -EINVAL;
3802         }
3803         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3804                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3805                 return -EINVAL;
3806         }
3807         if (alloc_slab) {
3808                 prot->slab = kmem_cache_create_usercopy(prot->name,
3809                                         prot->obj_size, 0,
3810                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3811                                         prot->slab_flags,
3812                                         prot->useroffset, prot->usersize,
3813                                         NULL);
3814
3815                 if (prot->slab == NULL) {
3816                         pr_crit("%s: Can't create sock SLAB cache!\n",
3817                                 prot->name);
3818                         goto out;
3819                 }
3820
3821                 if (req_prot_init(prot))
3822                         goto out_free_request_sock_slab;
3823
3824                 if (tw_prot_init(prot))
3825                         goto out_free_timewait_sock_slab;
3826         }
3827
3828         mutex_lock(&proto_list_mutex);
3829         ret = assign_proto_idx(prot);
3830         if (ret) {
3831                 mutex_unlock(&proto_list_mutex);
3832                 goto out_free_timewait_sock_slab;
3833         }
3834         list_add(&prot->node, &proto_list);
3835         mutex_unlock(&proto_list_mutex);
3836         return ret;
3837
3838 out_free_timewait_sock_slab:
3839         if (alloc_slab)
3840                 tw_prot_cleanup(prot->twsk_prot);
3841 out_free_request_sock_slab:
3842         if (alloc_slab) {
3843                 req_prot_cleanup(prot->rsk_prot);
3844
3845                 kmem_cache_destroy(prot->slab);
3846                 prot->slab = NULL;
3847         }
3848 out:
3849         return ret;
3850 }
3851 EXPORT_SYMBOL(proto_register);
3852
/*
 * Undo proto_register(): release the protocol's in-use index, take it off
 * proto_list, and destroy its sock, request-sock and timewait-sock slab
 * caches.
 */
void proto_unregister(struct proto *prot)
{
        /* index release and list removal share one proto_list_mutex hold */
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        /* the caches are torn down after the mutex has been dropped */
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
3866 EXPORT_SYMBOL(proto_unregister);
3867
3868 int sock_load_diag_module(int family, int protocol)
3869 {
3870         if (!protocol) {
3871                 if (!sock_is_registered(family))
3872                         return -ENOENT;
3873
3874                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3875                                       NETLINK_SOCK_DIAG, family);
3876         }
3877
3878 #ifdef CONFIG_INET
3879         if (family == AF_INET &&
3880             protocol != IPPROTO_RAW &&
3881             protocol < MAX_INET_PROTOS &&
3882             !rcu_access_pointer(inet_protos[protocol]))
3883                 return -ENOENT;
3884 #endif
3885
3886         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3887                               NETLINK_SOCK_DIAG, family, protocol);
3888 }
3889 EXPORT_SYMBOL(sock_load_diag_module);
3890
3891 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos on proto_list. The mutex is released by proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

/* seq_file ->next: advance to the following proto_list entry. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

/* seq_file ->stop: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}
3909
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc table. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
3914 static long sock_prot_memory_allocated(struct proto *proto)
3915 {
3916         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3917 }
3918
3919 static const char *sock_prot_memory_pressure(struct proto *proto)
3920 {
3921         return proto->memory_pressure != NULL ?
3922         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3923 }
3924
3925 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3926 {
3927
3928         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3929                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3930                    proto->name,
3931                    proto->obj_size,
3932                    sock_prot_inuse_get(seq_file_net(seq), proto),
3933                    sock_prot_memory_allocated(proto),
3934                    sock_prot_memory_pressure(proto),
3935                    proto->max_header,
3936                    proto->slab == NULL ? "no" : "yes",
3937                    module_name(proto->owner),
3938                    proto_method_implemented(proto->close),
3939                    proto_method_implemented(proto->connect),
3940                    proto_method_implemented(proto->disconnect),
3941                    proto_method_implemented(proto->accept),
3942                    proto_method_implemented(proto->ioctl),
3943                    proto_method_implemented(proto->init),
3944                    proto_method_implemented(proto->destroy),
3945                    proto_method_implemented(proto->shutdown),
3946                    proto_method_implemented(proto->setsockopt),
3947                    proto_method_implemented(proto->getsockopt),
3948                    proto_method_implemented(proto->sendmsg),
3949                    proto_method_implemented(proto->recvmsg),
3950                    proto_method_implemented(proto->sendpage),
3951                    proto_method_implemented(proto->bind),
3952                    proto_method_implemented(proto->backlog_rcv),
3953                    proto_method_implemented(proto->hash),
3954                    proto_method_implemented(proto->unhash),
3955                    proto_method_implemented(proto->get_port),
3956                    proto_method_implemented(proto->enter_memory_pressure));
3957 }
3958
3959 static int proto_seq_show(struct seq_file *seq, void *v)
3960 {
3961         if (v == &proto_list)
3962                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3963                            "protocol",
3964                            "size",
3965                            "sockets",
3966                            "memory",
3967                            "press",
3968                            "maxhdr",
3969                            "slab",
3970                            "module",
3971                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3972         else
3973                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3974         return 0;
3975 }
3976
/* seq_file iterator over proto_list, used for the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
3983
3984 static __net_init int proto_init_net(struct net *net)
3985 {
3986         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3987                         sizeof(struct seq_net_private)))
3988                 return -ENOMEM;
3989
3990         return 0;
3991 }
3992
/* Per-namespace exit: remove the "protocols" entry from proc_net. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}


/* Namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

/* Register the hooks above; returns register_pernet_subsys()'s result. */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}
4008
4009 subsys_initcall(proto_init);
4010
4011 #endif /* PROC_FS */
4012
4013 #ifdef CONFIG_NET_RX_BUSY_POLL
4014 bool sk_busy_loop_end(void *p, unsigned long start_time)
4015 {
4016         struct sock *sk = p;
4017
4018         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4019                sk_busy_loop_timeout(sk, start_time);
4020 }
4021 EXPORT_SYMBOL(sk_busy_loop_end);
4022 #endif /* CONFIG_NET_RX_BUSY_POLL */
4023
4024 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4025 {
4026         if (!sk->sk_prot->bind_add)
4027                 return -EOPNOTSUPP;
4028         return sk->sk_prot->bind_add(sk, addr, addr_len);
4029 }
4030 EXPORT_SYMBOL(sock_bind_add);