net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 #include <linux/ethtool.h>
 143
 144 #include "dev.h"
 145
 146 static DEFINE_MUTEX(proto_list_mutex);
 147 static LIST_HEAD(proto_list);
 148
 149 static void sock_def_write_space_wfree(struct sock *sk);
 150 static void sock_def_write_space(struct sock *sk);
 151
 152 /**
 153  * sk_ns_capable - General socket capability test
 154  * @sk: Socket to use a capability on or through
 155  * @user_ns: The user namespace of the capability to use
 156  * @cap: The capability to use
 157  *
 158  * Test to see if the opener of the socket had when the socket was
 159  * created and the current process has the capability @cap in the user
 160  * namespace @user_ns.
 161  */
 162 bool sk_ns_capable(const struct sock *sk,
 163                    struct user_namespace *user_ns, int cap)
 164 {
 165         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 166                 ns_capable(user_ns, cap);
 167 }
 168 EXPORT_SYMBOL(sk_ns_capable);
 169
 170 /**
 171  * sk_capable - Socket global capability test
 172  * @sk: Socket to use a capability on or through
 173  * @cap: The global capability to use
 174  *
 175  * Test to see if the opener of the socket had when the socket was
 176  * created and the current process has the capability @cap in all user
 177  * namespaces.
 178  */
 179 bool sk_capable(const struct sock *sk, int cap)
 180 {
 181         return sk_ns_capable(sk, &init_user_ns, cap);
 182 }
 183 EXPORT_SYMBOL(sk_capable);
 184
 185 /**
 186  * sk_net_capable - Network namespace socket capability test
 187  * @sk: Socket to use a capability on or through
 188  * @cap: The capability to use
 189  *
 190  * Test to see if the opener of the socket had when the socket was created
 191  * and the current process has the capability @cap over the network namespace
 192  * the socket is a member of.
 193  */
 194 bool sk_net_capable(const struct sock *sk, int cap)
 195 {
 196         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 197 }
 198 EXPORT_SYMBOL(sk_net_capable);
 199
 200 /*
 201  * Each address family might have different locking rules, so we have
 202  * one slock key per address family and separate keys for internal and
 203  * userspace sockets.
 204  */
 205 static struct lock_class_key af_family_keys[AF_MAX];
 206 static struct lock_class_key af_family_kern_keys[AF_MAX];
 207 static struct lock_class_key af_family_slock_keys[AF_MAX];
 208 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 209
 210 /*
 211  * Make lock validator output more readable. (we pre-construct these
 212  * strings build-time, so that runtime initialization of socket
 213  * locks is fast):
 214  */
 215
 216 #define _sock_locks(x)                                            \
 217   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 218   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 219   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 220   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 221   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 222   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 223   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 224   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 225   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 226   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 227   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 228   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 229   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 230   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 231   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 232   x "AF_MCTP"  , \
 233   x "AF_MAX"
 234
 235 static const char *const af_family_key_strings[AF_MAX+1] = {
 236         _sock_locks("sk_lock-")
 237 };
 238 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 239         _sock_locks("slock-")
 240 };
 241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 242         _sock_locks("clock-")
 243 };
 244
 245 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-sk_lock-")
 247 };
 248 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 249         _sock_locks("k-slock-")
 250 };
 251 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 252         _sock_locks("k-clock-")
 253 };
 254 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 255         _sock_locks("rlock-")
 256 };
 257 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 258         _sock_locks("wlock-")
 259 };
 260 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 261         _sock_locks("elock-")
 262 };
 263
 264 /*
 265  * sk_callback_lock and sk queues locking rules are per-address-family,
 266  * so split the lock classes by using a per-AF key:
 267  */
 268 static struct lock_class_key af_callback_keys[AF_MAX];
 269 static struct lock_class_key af_rlock_keys[AF_MAX];
 270 static struct lock_class_key af_wlock_keys[AF_MAX];
 271 static struct lock_class_key af_elock_keys[AF_MAX];
 272 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 273
 274 /* Run time adjustable parameters. */
 275 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 276 EXPORT_SYMBOL(sysctl_wmem_max);
 277 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 278 EXPORT_SYMBOL(sysctl_rmem_max);
 279 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 280 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 281
 282 /* Maximal space eaten by iovec or ancillary data plus some space */
 283 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 284 EXPORT_SYMBOL(sysctl_optmem_max);
 285
 286 int sysctl_tstamp_allow_data __read_mostly = 1;
 287
 288 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 289 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 290
 291 /**
 292  * sk_set_memalloc - sets %SOCK_MEMALLOC
 293  * @sk: socket to set it on
 294  *
 295  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 296  * It's the responsibility of the admin to adjust min_free_kbytes
 297  * to meet the requirements
 298  */
 299 void sk_set_memalloc(struct sock *sk)
 300 {
 301         sock_set_flag(sk, SOCK_MEMALLOC);
 302         sk->sk_allocation |= __GFP_MEMALLOC;
 303         static_branch_inc(&memalloc_socks_key);
 304 }
 305 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 306
 307 void sk_clear_memalloc(struct sock *sk)
 308 {
 309         sock_reset_flag(sk, SOCK_MEMALLOC);
 310         sk->sk_allocation &= ~__GFP_MEMALLOC;
 311         static_branch_dec(&memalloc_socks_key);
 312
 313         /*
 314          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 315          * progress of swapping. SOCK_MEMALLOC may be cleared while
 316          * it has rmem allocations due to the last swapfile being deactivated
 317          * but there is a risk that the socket is unusable due to exceeding
 318          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 319          */
 320         sk_mem_reclaim(sk);
 321 }
 322 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 323
 324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 325 {
 326         int ret;
 327         unsigned int noreclaim_flag;
 328
 329         /* these should have been dropped before queueing */
 330         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 331
 332         noreclaim_flag = memalloc_noreclaim_save();
 333         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 334                                  tcp_v6_do_rcv,
 335                                  tcp_v4_do_rcv,
 336                                  sk, skb);
 337         memalloc_noreclaim_restore(noreclaim_flag);
 338
 339         return ret;
 340 }
 341 EXPORT_SYMBOL(__sk_backlog_rcv);
 342
 343 void sk_error_report(struct sock *sk)
 344 {
 345         sk->sk_error_report(sk);
 346
 347         switch (sk->sk_family) {
 348         case AF_INET:
 349                 fallthrough;
 350         case AF_INET6:
 351                 trace_inet_sk_error_report(sk);
 352                 break;
 353         default:
 354                 break;
 355         }
 356 }
 357 EXPORT_SYMBOL(sk_error_report);
 358
 359 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 360 {
 361         struct __kernel_sock_timeval tv;
 362
 363         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 364                 tv.tv_sec = 0;
 365                 tv.tv_usec = 0;
 366         } else {
 367                 tv.tv_sec = timeo / HZ;
 368                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 369         }
 370
 371         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 372                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 373                 *(struct old_timeval32 *)optval = tv32;
 374                 return sizeof(tv32);
 375         }
 376
 377         if (old_timeval) {
 378                 struct __kernel_old_timeval old_tv;
 379                 old_tv.tv_sec = tv.tv_sec;
 380                 old_tv.tv_usec = tv.tv_usec;
 381                 *(struct __kernel_old_timeval *)optval = old_tv;
 382                 return sizeof(old_tv);
 383         }
 384
 385         *(struct __kernel_sock_timeval *)optval = tv;
 386         return sizeof(tv);
 387 }
 388 EXPORT_SYMBOL(sock_get_timeout);
 389
 390 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 391                            sockptr_t optval, int optlen, bool old_timeval)
 392 {
 393         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 394                 struct old_timeval32 tv32;
 395
 396                 if (optlen < sizeof(tv32))
 397                         return -EINVAL;
 398
 399                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 400                         return -EFAULT;
 401                 tv->tv_sec = tv32.tv_sec;
 402                 tv->tv_usec = tv32.tv_usec;
 403         } else if (old_timeval) {
 404                 struct __kernel_old_timeval old_tv;
 405
 406                 if (optlen < sizeof(old_tv))
 407                         return -EINVAL;
 408                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 409                         return -EFAULT;
 410                 tv->tv_sec = old_tv.tv_sec;
 411                 tv->tv_usec = old_tv.tv_usec;
 412         } else {
 413                 if (optlen < sizeof(*tv))
 414                         return -EINVAL;
 415                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 416                         return -EFAULT;
 417         }
 418
 419         return 0;
 420 }
 421 EXPORT_SYMBOL(sock_copy_user_timeval);
 422
 423 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 424                             bool old_timeval)
 425 {
 426         struct __kernel_sock_timeval tv;
 427         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 428
 429         if (err)
 430                 return err;
 431
 432         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 433                 return -EDOM;
 434
 435         if (tv.tv_sec < 0) {
 436                 static int warned __read_mostly;
 437
 438                 *timeo_p = 0;
 439                 if (warned < 10 && net_ratelimit()) {
 440                         warned++;
 441                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 442                                 __func__, current->comm, task_pid_nr(current));
 443                 }
 444                 return 0;
 445         }
 446         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 447         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 448                 return 0;
 449         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 450                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 451         return 0;
 452 }
 453
 454 static bool sock_needs_netstamp(const struct sock *sk)
 455 {
 456         switch (sk->sk_family) {
 457         case AF_UNSPEC:
 458         case AF_UNIX:
 459                 return false;
 460         default:
 461                 return true;
 462         }
 463 }
 464
 465 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 466 {
 467         if (sk->sk_flags & flags) {
 468                 sk->sk_flags &= ~flags;
 469                 if (sock_needs_netstamp(sk) &&
 470                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 471                         net_disable_timestamp();
 472         }
 473 }
 474
 475
 476 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 477 {
 478         unsigned long flags;
 479         struct sk_buff_head *list = &sk->sk_receive_queue;
 480
 481         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 482                 atomic_inc(&sk->sk_drops);
 483                 trace_sock_rcvqueue_full(sk, skb);
 484                 return -ENOMEM;
 485         }
 486
 487         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 488                 atomic_inc(&sk->sk_drops);
 489                 return -ENOBUFS;
 490         }
 491
 492         skb->dev = NULL;
 493         skb_set_owner_r(skb, sk);
 494
 495         /* we escape from rcu protected region, make sure we dont leak
 496          * a norefcounted dst
 497          */
 498         skb_dst_force(skb);
 499
 500         spin_lock_irqsave(&list->lock, flags);
 501         sock_skb_set_dropcount(sk, skb);
 502         __skb_queue_tail(list, skb);
 503         spin_unlock_irqrestore(&list->lock, flags);
 504
 505         if (!sock_flag(sk, SOCK_DEAD))
 506                 sk->sk_data_ready(sk);
 507         return 0;
 508 }
 509 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 510
 511 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 512                               enum skb_drop_reason *reason)
 513 {
 514         enum skb_drop_reason drop_reason;
 515         int err;
 516
 517         err = sk_filter(sk, skb);
 518         if (err) {
 519                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 520                 goto out;
 521         }
 522         err = __sock_queue_rcv_skb(sk, skb);
 523         switch (err) {
 524         case -ENOMEM:
 525                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 526                 break;
 527         case -ENOBUFS:
 528                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 529                 break;
 530         default:
 531                 drop_reason = SKB_NOT_DROPPED_YET;
 532                 break;
 533         }
 534 out:
 535         if (reason)
 536                 *reason = drop_reason;
 537         return err;
 538 }
 539 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 540
 541 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 542                      const int nested, unsigned int trim_cap, bool refcounted)
 543 {
 544         int rc = NET_RX_SUCCESS;
 545
 546         if (sk_filter_trim_cap(sk, skb, trim_cap))
 547                 goto discard_and_relse;
 548
 549         skb->dev = NULL;
 550
 551         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 552                 atomic_inc(&sk->sk_drops);
 553                 goto discard_and_relse;
 554         }
 555         if (nested)
 556                 bh_lock_sock_nested(sk);
 557         else
 558                 bh_lock_sock(sk);
 559         if (!sock_owned_by_user(sk)) {
 560                 /*
 561                  * trylock + unlock semantics:
 562                  */
 563                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 564
 565                 rc = sk_backlog_rcv(sk, skb);
 566
 567                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 568         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 569                 bh_unlock_sock(sk);
 570                 atomic_inc(&sk->sk_drops);
 571                 goto discard_and_relse;
 572         }
 573
 574         bh_unlock_sock(sk);
 575 out:
 576         if (refcounted)
 577                 sock_put(sk);
 578         return rc;
 579 discard_and_relse:
 580         kfree_skb(skb);
 581         goto out;
 582 }
 583 EXPORT_SYMBOL(__sk_receive_skb);
 584
 585 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 586                                                           u32));
 587 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 588                                                            u32));
 589 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 590 {
 591         struct dst_entry *dst = __sk_dst_get(sk);
 592
 593         if (dst && dst->obsolete &&
 594             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 595                                dst, cookie) == NULL) {
 596                 sk_tx_queue_clear(sk);
 597                 sk->sk_dst_pending_confirm = 0;
 598                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 599                 dst_release(dst);
 600                 return NULL;
 601         }
 602
 603         return dst;
 604 }
 605 EXPORT_SYMBOL(__sk_dst_check);
 606
 607 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 608 {
 609         struct dst_entry *dst = sk_dst_get(sk);
 610
 611         if (dst && dst->obsolete &&
 612             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 613                                dst, cookie) == NULL) {
 614                 sk_dst_reset(sk);
 615                 dst_release(dst);
 616                 return NULL;
 617         }
 618
 619         return dst;
 620 }
 621 EXPORT_SYMBOL(sk_dst_check);
 622
 623 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 624 {
 625         int ret = -ENOPROTOOPT;
 626 #ifdef CONFIG_NETDEVICES
 627         struct net *net = sock_net(sk);
 628
 629         /* Sorry... */
 630         ret = -EPERM;
 631         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 632                 goto out;
 633
 634         ret = -EINVAL;
 635         if (ifindex < 0)
 636                 goto out;
 637
 638         /* Paired with all READ_ONCE() done locklessly. */
 639         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 640
 641         if (sk->sk_prot->rehash)
 642                 sk->sk_prot->rehash(sk);
 643         sk_dst_reset(sk);
 644
 645         ret = 0;
 646
 647 out:
 648 #endif
 649
 650         return ret;
 651 }
 652
 653 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 654 {
 655         int ret;
 656
 657         if (lock_sk)
 658                 lock_sock(sk);
 659         ret = sock_bindtoindex_locked(sk, ifindex);
 660         if (lock_sk)
 661                 release_sock(sk);
 662
 663         return ret;
 664 }
 665 EXPORT_SYMBOL(sock_bindtoindex);
 666
 667 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 668 {
 669         int ret = -ENOPROTOOPT;
 670 #ifdef CONFIG_NETDEVICES
 671         struct net *net = sock_net(sk);
 672         char devname[IFNAMSIZ];
 673         int index;
 674
 675         ret = -EINVAL;
 676         if (optlen < 0)
 677                 goto out;
 678
 679         /* Bind this socket to a particular device like "eth0",
 680          * as specified in the passed interface name. If the
 681          * name is "" or the option length is zero the socket
 682          * is not bound.
 683          */
 684         if (optlen > IFNAMSIZ - 1)
 685                 optlen = IFNAMSIZ - 1;
 686         memset(devname, 0, sizeof(devname));
 687
 688         ret = -EFAULT;
 689         if (copy_from_sockptr(devname, optval, optlen))
 690                 goto out;
 691
 692         index = 0;
 693         if (devname[0] != '\0') {
 694                 struct net_device *dev;
 695
 696                 rcu_read_lock();
 697                 dev = dev_get_by_name_rcu(net, devname);
 698                 if (dev)
 699                         index = dev->ifindex;
 700                 rcu_read_unlock();
 701                 ret = -ENODEV;
 702                 if (!dev)
 703                         goto out;
 704         }
 705
 706         return sock_bindtoindex(sk, index, true);
 707 out:
 708 #endif
 709
 710         return ret;
 711 }
 712
 713 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 714                                 int __user *optlen, int len)
 715 {
 716         int ret = -ENOPROTOOPT;
 717 #ifdef CONFIG_NETDEVICES
 718         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 719         struct net *net = sock_net(sk);
 720         char devname[IFNAMSIZ];
 721
 722         if (bound_dev_if == 0) {
 723                 len = 0;
 724                 goto zero;
 725         }
 726
 727         ret = -EINVAL;
 728         if (len < IFNAMSIZ)
 729                 goto out;
 730
 731         ret = netdev_get_name(net, devname, bound_dev_if);
 732         if (ret)
 733                 goto out;
 734
 735         len = strlen(devname) + 1;
 736
 737         ret = -EFAULT;
 738         if (copy_to_user(optval, devname, len))
 739                 goto out;
 740
 741 zero:
 742         ret = -EFAULT;
 743         if (put_user(len, optlen))
 744                 goto out;
 745
 746         ret = 0;
 747
 748 out:
 749 #endif
 750
 751         return ret;
 752 }
 753
 754 bool sk_mc_loop(struct sock *sk)
 755 {
 756         if (dev_recursion_level())
 757                 return false;
 758         if (!sk)
 759                 return true;
 760         switch (sk->sk_family) {
 761         case AF_INET:
 762                 return inet_sk(sk)->mc_loop;
 763 #if IS_ENABLED(CONFIG_IPV6)
 764         case AF_INET6:
 765                 return inet6_sk(sk)->mc_loop;
 766 #endif
 767         }
 768         WARN_ON_ONCE(1);
 769         return true;
 770 }
 771 EXPORT_SYMBOL(sk_mc_loop);
 772
 773 void sock_set_reuseaddr(struct sock *sk)
 774 {
 775         lock_sock(sk);
 776         sk->sk_reuse = SK_CAN_REUSE;
 777         release_sock(sk);
 778 }
 779 EXPORT_SYMBOL(sock_set_reuseaddr);
 780
 781 void sock_set_reuseport(struct sock *sk)
 782 {
 783         lock_sock(sk);
 784         sk->sk_reuseport = true;
 785         release_sock(sk);
 786 }
 787 EXPORT_SYMBOL(sock_set_reuseport);
 788
 789 void sock_no_linger(struct sock *sk)
 790 {
 791         lock_sock(sk);
 792         sk->sk_lingertime = 0;
 793         sock_set_flag(sk, SOCK_LINGER);
 794         release_sock(sk);
 795 }
 796 EXPORT_SYMBOL(sock_no_linger);
 797
 798 void sock_set_priority(struct sock *sk, u32 priority)
 799 {
 800         lock_sock(sk);
 801         sk->sk_priority = priority;
 802         release_sock(sk);
 803 }
 804 EXPORT_SYMBOL(sock_set_priority);
 805
 806 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 807 {
 808         lock_sock(sk);
 809         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 810                 sk->sk_sndtimeo = secs * HZ;
 811         else
 812                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 813         release_sock(sk);
 814 }
 815 EXPORT_SYMBOL(sock_set_sndtimeo);
 816
 817 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 818 {
 819         if (val)  {
 820                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 821                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 822                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 823                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 824         } else {
 825                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 826                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827         }
 828 }
 829
 830 void sock_enable_timestamps(struct sock *sk)
 831 {
 832         lock_sock(sk);
 833         __sock_set_timestamps(sk, true, false, true);
 834         release_sock(sk);
 835 }
 836 EXPORT_SYMBOL(sock_enable_timestamps);
 837
 838 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 839 {
 840         switch (optname) {
 841         case SO_TIMESTAMP_OLD:
 842                 __sock_set_timestamps(sk, valbool, false, false);
 843                 break;
 844         case SO_TIMESTAMP_NEW:
 845                 __sock_set_timestamps(sk, valbool, true, false);
 846                 break;
 847         case SO_TIMESTAMPNS_OLD:
 848                 __sock_set_timestamps(sk, valbool, false, true);
 849                 break;
 850         case SO_TIMESTAMPNS_NEW:
 851                 __sock_set_timestamps(sk, valbool, true, true);
 852                 break;
 853         }
 854 }
 855
 856 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 857 {
 858         struct net *net = sock_net(sk);
 859         struct net_device *dev = NULL;
 860         bool match = false;
 861         int *vclock_index;
 862         int i, num;
 863
 864         if (sk->sk_bound_dev_if)
 865                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 866
 867         if (!dev) {
 868                 pr_err("%s: sock not bind to device\n", __func__);
 869                 return -EOPNOTSUPP;
 870         }
 871
 872         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 873         dev_put(dev);
 874
 875         for (i = 0; i < num; i++) {
 876                 if (*(vclock_index + i) == phc_index) {
 877                         match = true;
 878                         break;
 879                 }
 880         }
 881
 882         if (num > 0)
 883                 kfree(vclock_index);
 884
 885         if (!match)
 886                 return -EINVAL;
 887
 888         sk->sk_bind_phc = phc_index;
 889
 890         return 0;
 891 }
 892
 893 int sock_set_timestamping(struct sock *sk, int optname,
 894                           struct so_timestamping timestamping)
 895 {
 896         int val = timestamping.flags;
 897         int ret;
 898
 899         if (val & ~SOF_TIMESTAMPING_MASK)
 900                 return -EINVAL;
 901
 902         if (val & SOF_TIMESTAMPING_OPT_ID &&
 903             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 904                 if (sk_is_tcp(sk)) {
 905                         if ((1 << sk->sk_state) &
 906                             (TCPF_CLOSE | TCPF_LISTEN))
 907                                 return -EINVAL;
 908                         atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 909                 } else {
 910                         atomic_set(&sk->sk_tskey, 0);
 911                 }
 912         }
 913
 914         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 915             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 916                 return -EINVAL;
 917
 918         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 919                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 920                 if (ret)
 921                         return ret;
 922         }
 923
 924         sk->sk_tsflags = val;
 925         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 926
 927         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 928                 sock_enable_timestamp(sk,
 929                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 930         else
 931                 sock_disable_timestamp(sk,
 932                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 933         return 0;
 934 }
 935
 936 void sock_set_keepalive(struct sock *sk)
 937 {
 938         lock_sock(sk);
 939         if (sk->sk_prot->keepalive)
 940                 sk->sk_prot->keepalive(sk, true);
 941         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 942         release_sock(sk);
 943 }
 944 EXPORT_SYMBOL(sock_set_keepalive);
 945
 946 static void __sock_set_rcvbuf(struct sock *sk, int val)
 947 {
 948         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 949          * as a negative value.
 950          */
 951         val = min_t(int, val, INT_MAX / 2);
 952         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 953
 954         /* We double it on the way in to account for "struct sk_buff" etc.
 955          * overhead.   Applications assume that the SO_RCVBUF setting they make
 956          * will allow that much actual data to be received on that socket.
 957          *
 958          * Applications are unaware that "struct sk_buff" and other overheads
 959          * allocate from the receive buffer during socket buffer allocation.
 960          *
 961          * And after considering the possible alternatives, returning the value
 962          * we actually used in getsockopt is the most desirable behavior.
 963          */
 964         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 965 }
 966
 967 void sock_set_rcvbuf(struct sock *sk, int val)
 968 {
 969         lock_sock(sk);
 970         __sock_set_rcvbuf(sk, val);
 971         release_sock(sk);
 972 }
 973 EXPORT_SYMBOL(sock_set_rcvbuf);
 974
 975 static void __sock_set_mark(struct sock *sk, u32 val)
 976 {
 977         if (val != sk->sk_mark) {
 978                 sk->sk_mark = val;
 979                 sk_dst_reset(sk);
 980         }
 981 }
 982
 983 void sock_set_mark(struct sock *sk, u32 val)
 984 {
 985         lock_sock(sk);
 986         __sock_set_mark(sk, val);
 987         release_sock(sk);
 988 }
 989 EXPORT_SYMBOL(sock_set_mark);
 990
 991 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 992 {
 993         /* Round down bytes to multiple of pages */
 994         bytes = round_down(bytes, PAGE_SIZE);
 995
 996         WARN_ON(bytes > sk->sk_reserved_mem);
 997         sk->sk_reserved_mem -= bytes;
 998         sk_mem_reclaim(sk);
 999 }
1000
1001 static int sock_reserve_memory(struct sock *sk, int bytes)
1002 {
1003         long allocated;
1004         bool charged;
1005         int pages;
1006
1007         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008                 return -EOPNOTSUPP;
1009
1010         if (!bytes)
1011                 return 0;
1012
1013         pages = sk_mem_pages(bytes);
1014
1015         /* pre-charge to memcg */
1016         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018         if (!charged)
1019                 return -ENOMEM;
1020
1021         /* pre-charge to forward_alloc */
1022         sk_memory_allocated_add(sk, pages);
1023         allocated = sk_memory_allocated(sk);
1024         /* If the system goes into memory pressure with this
1025          * precharge, give up and return error.
1026          */
1027         if (allocated > sk_prot_mem_limits(sk, 1)) {
1028                 sk_memory_allocated_sub(sk, pages);
1029                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1030                 return -ENOMEM;
1031         }
1032         sk->sk_forward_alloc += pages << PAGE_SHIFT;
1033
1034         sk->sk_reserved_mem += pages << PAGE_SHIFT;
1035
1036         return 0;
1037 }
1038
1039 /*
1040  *      This is meant for all protocols to use and covers goings on
1041  *      at the socket level. Everything here is generic.
1042  */
1043
1044 int sock_setsockopt(struct socket *sock, int level, int optname,
1045                     sockptr_t optval, unsigned int optlen)
1046 {
1047         struct so_timestamping timestamping;
1048         struct sock_txtime sk_txtime;
1049         struct sock *sk = sock->sk;
1050         int val;
1051         int valbool;
1052         struct linger ling;
1053         int ret = 0;
1054
1055         /*
1056          *      Options without arguments
1057          */
1058
1059         if (optname == SO_BINDTODEVICE)
1060                 return sock_setbindtodevice(sk, optval, optlen);
1061
1062         if (optlen < sizeof(int))
1063                 return -EINVAL;
1064
1065         if (copy_from_sockptr(&val, optval, sizeof(val)))
1066                 return -EFAULT;
1067
1068         valbool = val ? 1 : 0;
1069
1070         lock_sock(sk);
1071
1072         switch (optname) {
1073         case SO_DEBUG:
1074                 if (val && !capable(CAP_NET_ADMIN))
1075                         ret = -EACCES;
1076                 else
1077                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1078                 break;
1079         case SO_REUSEADDR:
1080                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1081                 break;
1082         case SO_REUSEPORT:
1083                 sk->sk_reuseport = valbool;
1084                 break;
1085         case SO_TYPE:
1086         case SO_PROTOCOL:
1087         case SO_DOMAIN:
1088         case SO_ERROR:
1089                 ret = -ENOPROTOOPT;
1090                 break;
1091         case SO_DONTROUTE:
1092                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1093                 sk_dst_reset(sk);
1094                 break;
1095         case SO_BROADCAST:
1096                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1097                 break;
1098         case SO_SNDBUF:
1099                 /* Don't error on this BSD doesn't and if you think
1100                  * about it this is right. Otherwise apps have to
1101                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1102                  * are treated in BSD as hints
1103                  */
1104                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1105 set_sndbuf:
1106                 /* Ensure val * 2 fits into an int, to prevent max_t()
1107                  * from treating it as a negative value.
1108                  */
1109                 val = min_t(int, val, INT_MAX / 2);
1110                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1111                 WRITE_ONCE(sk->sk_sndbuf,
1112                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1113                 /* Wake up sending tasks if we upped the value. */
1114                 sk->sk_write_space(sk);
1115                 break;
1116
1117         case SO_SNDBUFFORCE:
1118                 if (!capable(CAP_NET_ADMIN)) {
1119                         ret = -EPERM;
1120                         break;
1121                 }
1122
1123                 /* No negative values (to prevent underflow, as val will be
1124                  * multiplied by 2).
1125                  */
1126                 if (val < 0)
1127                         val = 0;
1128                 goto set_sndbuf;
1129
1130         case SO_RCVBUF:
1131                 /* Don't error on this BSD doesn't and if you think
1132                  * about it this is right. Otherwise apps have to
1133                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1134                  * are treated in BSD as hints
1135                  */
1136                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1137                 break;
1138
1139         case SO_RCVBUFFORCE:
1140                 if (!capable(CAP_NET_ADMIN)) {
1141                         ret = -EPERM;
1142                         break;
1143                 }
1144
1145                 /* No negative values (to prevent underflow, as val will be
1146                  * multiplied by 2).
1147                  */
1148                 __sock_set_rcvbuf(sk, max(val, 0));
1149                 break;
1150
1151         case SO_KEEPALIVE:
1152                 if (sk->sk_prot->keepalive)
1153                         sk->sk_prot->keepalive(sk, valbool);
1154                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1155                 break;
1156
1157         case SO_OOBINLINE:
1158                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1159                 break;
1160
1161         case SO_NO_CHECK:
1162                 sk->sk_no_check_tx = valbool;
1163                 break;
1164
1165         case SO_PRIORITY:
1166                 if ((val >= 0 && val <= 6) ||
1167                     ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1168                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1169                         sk->sk_priority = val;
1170                 else
1171                         ret = -EPERM;
1172                 break;
1173
1174         case SO_LINGER:
1175                 if (optlen < sizeof(ling)) {
1176                         ret = -EINVAL;  /* 1003.1g */
1177                         break;
1178                 }
1179                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1180                         ret = -EFAULT;
1181                         break;
1182                 }
1183                 if (!ling.l_onoff)
1184                         sock_reset_flag(sk, SOCK_LINGER);
1185                 else {
1186 #if (BITS_PER_LONG == 32)
1187                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1188                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1189                         else
1190 #endif
1191                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1192                         sock_set_flag(sk, SOCK_LINGER);
1193                 }
1194                 break;
1195
1196         case SO_BSDCOMPAT:
1197                 break;
1198
1199         case SO_PASSCRED:
1200                 if (valbool)
1201                         set_bit(SOCK_PASSCRED, &sock->flags);
1202                 else
1203                         clear_bit(SOCK_PASSCRED, &sock->flags);
1204                 break;
1205
1206         case SO_TIMESTAMP_OLD:
1207         case SO_TIMESTAMP_NEW:
1208         case SO_TIMESTAMPNS_OLD:
1209         case SO_TIMESTAMPNS_NEW:
1210                 sock_set_timestamp(sk, optname, valbool);
1211                 break;
1212
1213         case SO_TIMESTAMPING_NEW:
1214         case SO_TIMESTAMPING_OLD:
1215                 if (optlen == sizeof(timestamping)) {
1216                         if (copy_from_sockptr(&timestamping, optval,
1217                                               sizeof(timestamping))) {
1218                                 ret = -EFAULT;
1219                                 break;
1220                         }
1221                 } else {
1222                         memset(&timestamping, 0, sizeof(timestamping));
1223                         timestamping.flags = val;
1224                 }
1225                 ret = sock_set_timestamping(sk, optname, timestamping);
1226                 break;
1227
1228         case SO_RCVLOWAT:
1229                 if (val < 0)
1230                         val = INT_MAX;
1231                 if (sock->ops->set_rcvlowat)
1232                         ret = sock->ops->set_rcvlowat(sk, val);
1233                 else
1234                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1235                 break;
1236
1237         case SO_RCVTIMEO_OLD:
1238         case SO_RCVTIMEO_NEW:
1239                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1240                                        optlen, optname == SO_RCVTIMEO_OLD);
1241                 break;
1242
1243         case SO_SNDTIMEO_OLD:
1244         case SO_SNDTIMEO_NEW:
1245                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1246                                        optlen, optname == SO_SNDTIMEO_OLD);
1247                 break;
1248
1249         case SO_ATTACH_FILTER: {
1250                 struct sock_fprog fprog;
1251
1252                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1253                 if (!ret)
1254                         ret = sk_attach_filter(&fprog, sk);
1255                 break;
1256         }
1257         case SO_ATTACH_BPF:
1258                 ret = -EINVAL;
1259                 if (optlen == sizeof(u32)) {
1260                         u32 ufd;
1261
1262                         ret = -EFAULT;
1263                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1264                                 break;
1265
1266                         ret = sk_attach_bpf(ufd, sk);
1267                 }
1268                 break;
1269
1270         case SO_ATTACH_REUSEPORT_CBPF: {
1271                 struct sock_fprog fprog;
1272
1273                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1274                 if (!ret)
1275                         ret = sk_reuseport_attach_filter(&fprog, sk);
1276                 break;
1277         }
1278         case SO_ATTACH_REUSEPORT_EBPF:
1279                 ret = -EINVAL;
1280                 if (optlen == sizeof(u32)) {
1281                         u32 ufd;
1282
1283                         ret = -EFAULT;
1284                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1285                                 break;
1286
1287                         ret = sk_reuseport_attach_bpf(ufd, sk);
1288                 }
1289                 break;
1290
1291         case SO_DETACH_REUSEPORT_BPF:
1292                 ret = reuseport_detach_prog(sk);
1293                 break;
1294
1295         case SO_DETACH_FILTER:
1296                 ret = sk_detach_filter(sk);
1297                 break;
1298
1299         case SO_LOCK_FILTER:
1300                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1301                         ret = -EPERM;
1302                 else
1303                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1304                 break;
1305
1306         case SO_PASSSEC:
1307                 if (valbool)
1308                         set_bit(SOCK_PASSSEC, &sock->flags);
1309                 else
1310                         clear_bit(SOCK_PASSSEC, &sock->flags);
1311                 break;
1312         case SO_MARK:
1313                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1314                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1315                         ret = -EPERM;
1316                         break;
1317                 }
1318
1319                 __sock_set_mark(sk, val);
1320                 break;
1321         case SO_RCVMARK:
1322                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1323                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1324                         ret = -EPERM;
1325                         break;
1326                 }
1327
1328                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1329                 break;
1330
1331         case SO_RXQ_OVFL:
1332                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1333                 break;
1334
1335         case SO_WIFI_STATUS:
1336                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1337                 break;
1338
1339         case SO_PEEK_OFF:
1340                 if (sock->ops->set_peek_off)
1341                         ret = sock->ops->set_peek_off(sk, val);
1342                 else
1343                         ret = -EOPNOTSUPP;
1344                 break;
1345
1346         case SO_NOFCS:
1347                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1348                 break;
1349
1350         case SO_SELECT_ERR_QUEUE:
1351                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1352                 break;
1353
1354 #ifdef CONFIG_NET_RX_BUSY_POLL
1355         case SO_BUSY_POLL:
1356                 /* allow unprivileged users to decrease the value */
1357                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1358                         ret = -EPERM;
1359                 else {
1360                         if (val < 0)
1361                                 ret = -EINVAL;
1362                         else
1363                                 WRITE_ONCE(sk->sk_ll_usec, val);
1364                 }
1365                 break;
1366         case SO_PREFER_BUSY_POLL:
1367                 if (valbool && !capable(CAP_NET_ADMIN))
1368                         ret = -EPERM;
1369                 else
1370                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1371                 break;
1372         case SO_BUSY_POLL_BUDGET:
1373                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1374                         ret = -EPERM;
1375                 } else {
1376                         if (val < 0 || val > U16_MAX)
1377                                 ret = -EINVAL;
1378                         else
1379                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1380                 }
1381                 break;
1382 #endif
1383
1384         case SO_MAX_PACING_RATE:
1385                 {
1386                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1387
1388                 if (sizeof(ulval) != sizeof(val) &&
1389                     optlen >= sizeof(ulval) &&
1390                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1391                         ret = -EFAULT;
1392                         break;
1393                 }
1394                 if (ulval != ~0UL)
1395                         cmpxchg(&sk->sk_pacing_status,
1396                                 SK_PACING_NONE,
1397                                 SK_PACING_NEEDED);
1398                 sk->sk_max_pacing_rate = ulval;
1399                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1400                 break;
1401                 }
1402         case SO_INCOMING_CPU:
1403                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1404                 break;
1405
1406         case SO_CNX_ADVICE:
1407                 if (val == 1)
1408                         dst_negative_advice(sk);
1409                 break;
1410
1411         case SO_ZEROCOPY:
1412                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1413                         if (!(sk_is_tcp(sk) ||
1414                               (sk->sk_type == SOCK_DGRAM &&
1415                                sk->sk_protocol == IPPROTO_UDP)))
1416                                 ret = -EOPNOTSUPP;
1417                 } else if (sk->sk_family != PF_RDS) {
1418                         ret = -EOPNOTSUPP;
1419                 }
1420                 if (!ret) {
1421                         if (val < 0 || val > 1)
1422                                 ret = -EINVAL;
1423                         else
1424                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1425                 }
1426                 break;
1427
1428         case SO_TXTIME:
1429                 if (optlen != sizeof(struct sock_txtime)) {
1430                         ret = -EINVAL;
1431                         break;
1432                 } else if (copy_from_sockptr(&sk_txtime, optval,
1433                            sizeof(struct sock_txtime))) {
1434                         ret = -EFAULT;
1435                         break;
1436                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1437                         ret = -EINVAL;
1438                         break;
1439                 }
1440                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1441                  * scheduler has enough safe guards.
1442                  */
1443                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1444                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1445                         ret = -EPERM;
1446                         break;
1447                 }
1448                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1449                 sk->sk_clockid = sk_txtime.clockid;
1450                 sk->sk_txtime_deadline_mode =
1451                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1452                 sk->sk_txtime_report_errors =
1453                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1454                 break;
1455
1456         case SO_BINDTOIFINDEX:
1457                 ret = sock_bindtoindex_locked(sk, val);
1458                 break;
1459
1460         case SO_BUF_LOCK:
1461                 if (val & ~SOCK_BUF_LOCK_MASK) {
1462                         ret = -EINVAL;
1463                         break;
1464                 }
1465                 sk->sk_userlocks = val | (sk->sk_userlocks &
1466                                           ~SOCK_BUF_LOCK_MASK);
1467                 break;
1468
1469         case SO_RESERVE_MEM:
1470         {
1471                 int delta;
1472
1473                 if (val < 0) {
1474                         ret = -EINVAL;
1475                         break;
1476                 }
1477
1478                 delta = val - sk->sk_reserved_mem;
1479                 if (delta < 0)
1480                         sock_release_reserved_memory(sk, -delta);
1481                 else
1482                         ret = sock_reserve_memory(sk, delta);
1483                 break;
1484         }
1485
1486         case SO_TXREHASH:
1487                 if (val < -1 || val > 1) {
1488                         ret = -EINVAL;
1489                         break;
1490                 }
1491                 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1492                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1493                 break;
1494
1495         default:
1496                 ret = -ENOPROTOOPT;
1497                 break;
1498         }
1499         release_sock(sk);
1500         return ret;
1501 }
1502 EXPORT_SYMBOL(sock_setsockopt);
1503
1504 static const struct cred *sk_get_peer_cred(struct sock *sk)
1505 {
1506         const struct cred *cred;
1507
1508         spin_lock(&sk->sk_peer_lock);
1509         cred = get_cred(sk->sk_peer_cred);
1510         spin_unlock(&sk->sk_peer_lock);
1511
1512         return cred;
1513 }
1514
1515 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1516                           struct ucred *ucred)
1517 {
1518         ucred->pid = pid_vnr(pid);
1519         ucred->uid = ucred->gid = -1;
1520         if (cred) {
1521                 struct user_namespace *current_ns = current_user_ns();
1522
1523                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1524                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1525         }
1526 }
1527
1528 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1529 {
1530         struct user_namespace *user_ns = current_user_ns();
1531         int i;
1532
1533         for (i = 0; i < src->ngroups; i++)
1534                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1535                         return -EFAULT;
1536
1537         return 0;
1538 }
1539
1540 int sock_getsockopt(struct socket *sock, int level, int optname,
1541                     char __user *optval, int __user *optlen)
1542 {
1543         struct sock *sk = sock->sk;
1544
1545         union {
1546                 int val;
1547                 u64 val64;
1548                 unsigned long ulval;
1549                 struct linger ling;
1550                 struct old_timeval32 tm32;
1551                 struct __kernel_old_timeval tm;
1552                 struct  __kernel_sock_timeval stm;
1553                 struct sock_txtime txtime;
1554                 struct so_timestamping timestamping;
1555         } v;
1556
1557         int lv = sizeof(int);
1558         int len;
1559
1560         if (get_user(len, optlen))
1561                 return -EFAULT;
1562         if (len < 0)
1563                 return -EINVAL;
1564
1565         memset(&v, 0, sizeof(v));
1566
1567         switch (optname) {
1568         case SO_DEBUG:
1569                 v.val = sock_flag(sk, SOCK_DBG);
1570                 break;
1571
1572         case SO_DONTROUTE:
1573                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1574                 break;
1575
1576         case SO_BROADCAST:
1577                 v.val = sock_flag(sk, SOCK_BROADCAST);
1578                 break;
1579
1580         case SO_SNDBUF:
1581                 v.val = sk->sk_sndbuf;
1582                 break;
1583
1584         case SO_RCVBUF:
1585                 v.val = sk->sk_rcvbuf;
1586                 break;
1587
1588         case SO_REUSEADDR:
1589                 v.val = sk->sk_reuse;
1590                 break;
1591
1592         case SO_REUSEPORT:
1593                 v.val = sk->sk_reuseport;
1594                 break;
1595
1596         case SO_KEEPALIVE:
1597                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1598                 break;
1599
1600         case SO_TYPE:
1601                 v.val = sk->sk_type;
1602                 break;
1603
1604         case SO_PROTOCOL:
1605                 v.val = sk->sk_protocol;
1606                 break;
1607
1608         case SO_DOMAIN:
1609                 v.val = sk->sk_family;
1610                 break;
1611
1612         case SO_ERROR:
1613                 v.val = -sock_error(sk);
1614                 if (v.val == 0)
1615                         v.val = xchg(&sk->sk_err_soft, 0);
1616                 break;
1617
1618         case SO_OOBINLINE:
1619                 v.val = sock_flag(sk, SOCK_URGINLINE);
1620                 break;
1621
1622         case SO_NO_CHECK:
1623                 v.val = sk->sk_no_check_tx;
1624                 break;
1625
1626         case SO_PRIORITY:
1627                 v.val = sk->sk_priority;
1628                 break;
1629
1630         case SO_LINGER:
1631                 lv              = sizeof(v.ling);
1632                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1633                 v.ling.l_linger = sk->sk_lingertime / HZ;
1634                 break;
1635
1636         case SO_BSDCOMPAT:
1637                 break;
1638
1639         case SO_TIMESTAMP_OLD:
1640                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1641                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1642                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1643                 break;
1644
1645         case SO_TIMESTAMPNS_OLD:
1646                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1647                 break;
1648
1649         case SO_TIMESTAMP_NEW:
1650                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1651                 break;
1652
1653         case SO_TIMESTAMPNS_NEW:
1654                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1655                 break;
1656
1657         case SO_TIMESTAMPING_OLD:
1658                 lv = sizeof(v.timestamping);
1659                 v.timestamping.flags = sk->sk_tsflags;
1660                 v.timestamping.bind_phc = sk->sk_bind_phc;
1661                 break;
1662
1663         case SO_RCVTIMEO_OLD:
1664         case SO_RCVTIMEO_NEW:
1665                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1666                 break;
1667
1668         case SO_SNDTIMEO_OLD:
1669         case SO_SNDTIMEO_NEW:
1670                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1671                 break;
1672
1673         case SO_RCVLOWAT:
1674                 v.val = sk->sk_rcvlowat;
1675                 break;
1676
1677         case SO_SNDLOWAT:
1678                 v.val = 1;
1679                 break;
1680
1681         case SO_PASSCRED:
1682                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1683                 break;
1684
1685         case SO_PEERCRED:
1686         {
1687                 struct ucred peercred;
1688                 if (len > sizeof(peercred))
1689                         len = sizeof(peercred);
1690
1691                 spin_lock(&sk->sk_peer_lock);
1692                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1693                 spin_unlock(&sk->sk_peer_lock);
1694
1695                 if (copy_to_user(optval, &peercred, len))
1696                         return -EFAULT;
1697                 goto lenout;
1698         }
1699
1700         case SO_PEERGROUPS:
1701         {
1702                 const struct cred *cred;
1703                 int ret, n;
1704
1705                 cred = sk_get_peer_cred(sk);
1706                 if (!cred)
1707                         return -ENODATA;
1708
1709                 n = cred->group_info->ngroups;
1710                 if (len < n * sizeof(gid_t)) {
1711                         len = n * sizeof(gid_t);
1712                         put_cred(cred);
1713                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1714                 }
1715                 len = n * sizeof(gid_t);
1716
1717                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1718                 put_cred(cred);
1719                 if (ret)
1720                         return ret;
1721                 goto lenout;
1722         }
1723
1724         case SO_PEERNAME:
1725         {
1726                 char address[128];
1727
1728                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1729                 if (lv < 0)
1730                         return -ENOTCONN;
1731                 if (lv < len)
1732                         return -EINVAL;
1733                 if (copy_to_user(optval, address, len))
1734                         return -EFAULT;
1735                 goto lenout;
1736         }
1737
1738         /* Dubious BSD thing... Probably nobody even uses it, but
1739          * the UNIX standard wants it for whatever reason... -DaveM
1740          */
1741         case SO_ACCEPTCONN:
1742                 v.val = sk->sk_state == TCP_LISTEN;
1743                 break;
1744
1745         case SO_PASSSEC:
1746                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1747                 break;
1748
1749         case SO_PEERSEC:
1750                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1751
1752         case SO_MARK:
1753                 v.val = sk->sk_mark;
1754                 break;
1755
1756         case SO_RCVMARK:
1757                 v.val = sock_flag(sk, SOCK_RCVMARK);
1758                 break;
1759
1760         case SO_RXQ_OVFL:
1761                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1762                 break;
1763
1764         case SO_WIFI_STATUS:
1765                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1766                 break;
1767
1768         case SO_PEEK_OFF:
1769                 if (!sock->ops->set_peek_off)
1770                         return -EOPNOTSUPP;
1771
1772                 v.val = sk->sk_peek_off;
1773                 break;
1774         case SO_NOFCS:
1775                 v.val = sock_flag(sk, SOCK_NOFCS);
1776                 break;
1777
1778         case SO_BINDTODEVICE:
1779                 return sock_getbindtodevice(sk, optval, optlen, len);
1780
1781         case SO_GET_FILTER:
1782                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1783                 if (len < 0)
1784                         return len;
1785
1786                 goto lenout;
1787
1788         case SO_LOCK_FILTER:
1789                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1790                 break;
1791
1792         case SO_BPF_EXTENSIONS:
1793                 v.val = bpf_tell_extensions();
1794                 break;
1795
1796         case SO_SELECT_ERR_QUEUE:
1797                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1798                 break;
1799
1800 #ifdef CONFIG_NET_RX_BUSY_POLL
1801         case SO_BUSY_POLL:
1802                 v.val = sk->sk_ll_usec;
1803                 break;
1804         case SO_PREFER_BUSY_POLL:
1805                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1806                 break;
1807 #endif
1808
1809         case SO_MAX_PACING_RATE:
1810                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1811                         lv = sizeof(v.ulval);
1812                         v.ulval = sk->sk_max_pacing_rate;
1813                 } else {
1814                         /* 32bit version */
1815                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1816                 }
1817                 break;
1818
1819         case SO_INCOMING_CPU:
1820                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1821                 break;
1822
1823         case SO_MEMINFO:
1824         {
1825                 u32 meminfo[SK_MEMINFO_VARS];
1826
1827                 sk_get_meminfo(sk, meminfo);
1828
1829                 len = min_t(unsigned int, len, sizeof(meminfo));
1830                 if (copy_to_user(optval, &meminfo, len))
1831                         return -EFAULT;
1832
1833                 goto lenout;
1834         }
1835
1836 #ifdef CONFIG_NET_RX_BUSY_POLL
1837         case SO_INCOMING_NAPI_ID:
1838                 v.val = READ_ONCE(sk->sk_napi_id);
1839
1840                 /* aggregate non-NAPI IDs down to 0 */
1841                 if (v.val < MIN_NAPI_ID)
1842                         v.val = 0;
1843
1844                 break;
1845 #endif
1846
1847         case SO_COOKIE:
1848                 lv = sizeof(u64);
1849                 if (len < lv)
1850                         return -EINVAL;
1851                 v.val64 = sock_gen_cookie(sk);
1852                 break;
1853
1854         case SO_ZEROCOPY:
1855                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1856                 break;
1857
1858         case SO_TXTIME:
1859                 lv = sizeof(v.txtime);
1860                 v.txtime.clockid = sk->sk_clockid;
1861                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1862                                   SOF_TXTIME_DEADLINE_MODE : 0;
1863                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1864                                   SOF_TXTIME_REPORT_ERRORS : 0;
1865                 break;
1866
1867         case SO_BINDTOIFINDEX:
1868                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1869                 break;
1870
1871         case SO_NETNS_COOKIE:
1872                 lv = sizeof(u64);
1873                 if (len != lv)
1874                         return -EINVAL;
1875                 v.val64 = sock_net(sk)->net_cookie;
1876                 break;
1877
1878         case SO_BUF_LOCK:
1879                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1880                 break;
1881
1882         case SO_RESERVE_MEM:
1883                 v.val = sk->sk_reserved_mem;
1884                 break;
1885
1886         case SO_TXREHASH:
1887                 v.val = sk->sk_txrehash;
1888                 break;
1889
1890         default:
1891                 /* We implement the SO_SNDLOWAT etc to not be settable
1892                  * (1003.1g 7).
1893                  */
1894                 return -ENOPROTOOPT;
1895         }
1896
1897         if (len > lv)
1898                 len = lv;
1899         if (copy_to_user(optval, &v, len))
1900                 return -EFAULT;
1901 lenout:
1902         if (put_user(len, optlen))
1903                 return -EFAULT;
1904         return 0;
1905 }
1906
1907 /*
1908  * Initialize an sk_lock.
1909  *
1910  * (We also register the sk_lock with the lock validator.)
      *
      * The key and name tables handed to sock_lock_init_class_and_name() are
      * all indexed by sk->sk_family.  A sock with sk->sk_kern_sock set uses the
      * af_family_kern_* tables; every other sock uses the af_family_* tables.
1911  */
1912 static inline void sock_lock_init(struct sock *sk)
1913 {
1914         if (sk->sk_kern_sock)
1915                 sock_lock_init_class_and_name(
1916                         sk,
1917                         af_family_kern_slock_key_strings[sk->sk_family],
1918                         af_family_kern_slock_keys + sk->sk_family,
1919                         af_family_kern_key_strings[sk->sk_family],
1920                         af_family_kern_keys + sk->sk_family);
1921         else
1922                 sock_lock_init_class_and_name(
1923                         sk,
1924                         af_family_slock_key_strings[sk->sk_family],
1925                         af_family_slock_keys + sk->sk_family,
1926                         af_family_key_strings[sk->sk_family],
1927                         af_family_keys + sk->sk_family);
1928 }
1929
1930 /*
1931  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1932  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1933  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
      *
      * The copy is done as two memcpy() calls: one for everything in front of
      * sk_dontcopy_begin, and one from sk_dontcopy_end up to prot->obj_size,
      * where prot is the protocol read from osk.  With CONFIG_SECURITY_NETWORK
      * enabled, nsk->sk_security is saved before the copy and put back
      * afterwards, and security_sk_clone() is then called for the pair.
1934  */
1935 static void sock_copy(struct sock *nsk, const struct sock *osk)
1936 {
1937         const struct proto *prot = READ_ONCE(osk->sk_prot);
1938 #ifdef CONFIG_SECURITY_NETWORK
             /* Remember nsk's own security pointer; it is restored after the
              * copy (presumably because sk_security lies in a copied range).
              */
1939         void *sptr = nsk->sk_security;
1940 #endif
1941
1942         /* If we move sk_tx_queue_mapping out of the private section,
1943          * we must check if sk_tx_queue_clear() is called after
1944          * sock_copy() in sk_clone_lock().
1945          */
1946         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1947                      offsetof(struct sock, sk_dontcopy_begin) ||
1948                      offsetof(struct sock, sk_tx_queue_mapping) >=
1949                      offsetof(struct sock, sk_dontcopy_end));
1950
             /* Part 1: everything before the do-not-copy window. */
1951         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1952
             /* Part 2: from the end of the window to the end of the object. */
1953         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1954                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1955
1956 #ifdef CONFIG_SECURITY_NETWORK
1957         nsk->sk_security = sptr;
1958         security_sk_clone(osk, nsk);
1959 #endif
1960 }
1961
     /*
      * Allocate the memory for a new sock belonging to protocol @prot.
      *
      * When the protocol has its own slab (prot->slab), the object comes from
      * that cache with __GFP_ZERO masked out of @priority; if
      * want_init_on_alloc(priority) says so, it is then cleared through
      * sk_prot_clear_nulls() over prot->obj_size bytes.  Without a slab the
      * object is a plain kmalloc() of prot->obj_size using @priority as given.
      *
      * A successfully allocated sock then gets its security state set up via
      * security_sk_alloc() and takes a module reference on prot->owner.  If
      * either step fails, everything done so far is undone and NULL is
      * returned.
      *
      * Returns the new sock, or NULL on any failure.
      */
1962 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1963                 int family)
1964 {
1965         struct sock *sk;
1966         struct kmem_cache *slab;
1967
1968         slab = prot->slab;
1969         if (slab != NULL) {
                     /* NOTE(review): zeroing is left to sk_prot_clear_nulls()
                      * rather than the allocator - presumably so some fields
                      * survive; confirm against that helper.
                      */
1970                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1971                 if (!sk)
1972                         return sk;
1973                 if (want_init_on_alloc(priority))
1974                         sk_prot_clear_nulls(sk, prot->obj_size);
1975         } else
1976                 sk = kmalloc(prot->obj_size, priority);
1977
1978         if (sk != NULL) {
1979                 if (security_sk_alloc(sk, family, priority))
1980                         goto out_free;
1981
1982                 if (!try_module_get(prot->owner))
1983                         goto out_free_sec;
1984         }
1985
1986         return sk;
1987
     /* Unwind in reverse order of setup; out_free skips the security release. */
1988 out_free_sec:
1989         security_sk_free(sk);
1990 out_free:
1991         if (slab != NULL)
1992                 kmem_cache_free(slab, sk);
1993         else
1994                 kfree(sk);
1995         return NULL;
1996 }
1997
     /*
      * Counterpart of sk_prot_alloc(): release the memory of @sk, which was
      * created for protocol @prot.
      *
      * Calls cgroup_sk_free(), mem_cgroup_sk_free() and security_sk_free() for
      * the sock, gives the object back to prot->slab (or kfree() when the
      * protocol has no slab), and finally drops the module reference on
      * prot->owner.
      */
1998 static void sk_prot_free(struct proto *prot, struct sock *sk)
1999 {
2000         struct kmem_cache *slab;
2001         struct module *owner;
2002
             /* Both values are read from @prot up front, before anything is freed. */
2003         owner = prot->owner;
2004         slab = prot->slab;
2005
2006         cgroup_sk_free(&sk->sk_cgrp_data);
2007         mem_cgroup_sk_free(sk);
2008         security_sk_free(sk);
2009         if (slab != NULL)
2010                 kmem_cache_free(slab, sk);
2011         else
2012                 kfree(sk);
2013         module_put(owner);
2014 }
2015
2016 /**
2017  *      sk_alloc - All socket objects are allocated here
2018  *      @net: the applicable net namespace
2019  *      @family: protocol family
2020  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2021  *      @prot: struct proto associated with this new sock instance
2022  *      @kern: is this to be a kernel socket?
      *
      *      The object is requested zeroed (__GFP_ZERO is added to @priority).
      *      On success the family, protocol pointers, kernel-socket flag, lock,
      *      net namespace and cgroup related data are initialised, and
      *      sk_wmem_alloc starts at 1.
      *
      *      Return: the new sock, or NULL if sk_prot_alloc() failed.
2023  */
2024 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2025                       struct proto *prot, int kern)
2026 {
2027         struct sock *sk;
2028
2029         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2030         if (sk) {
2031                 sk->sk_family = family;
2032                 /*
2033                  * See comment in struct sock definition to understand
2034                  * why we need sk_prot_creator -acme
2035                  */
2036                 sk->sk_prot = sk->sk_prot_creator = prot;
2037                 sk->sk_kern_sock = kern;
2038                 sock_lock_init(sk);
                     /* Only non-kernel sockets take a tracked netns reference
                      * and are counted as in use for @net.
                      */
2039                 sk->sk_net_refcnt = kern ? 0 : 1;
2040                 if (likely(sk->sk_net_refcnt)) {
2041                         get_net_track(net, &sk->ns_tracker, priority);
2042                         sock_inuse_add(net, 1);
2043                 }
2044
2045                 sock_net_set(sk, net);
                     /* Base count; sk_free() is what drops it again. */
2046                 refcount_set(&sk->sk_wmem_alloc, 1);
2047
2048                 mem_cgroup_sk_alloc(sk);
2049                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2050                 sock_update_classid(&sk->sk_cgrp_data);
2051                 sock_update_netprioidx(&sk->sk_cgrp_data);
2052                 sk_tx_queue_clear(sk);
2053         }
2054
2055         return sk;
2056 }
2057 EXPORT_SYMBOL(sk_alloc);
2058
2059 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2060  * grace period. This is the case for UDP sockets and TCP listeners.
      *
      * Final teardown of a sock, reached through its embedded sk_rcu head.
      * In order: run the sk_destruct callback if one is set, uncharge and
      * clear an attached filter, disable timestamping, free the BPF sk
      * storage (CONFIG_BPF_SYSCALL only), log any non-zero sk_omem_alloc,
      * release the sk_frag page, drop the peer cred and pid references, drop
      * the tracked netns reference for sockets that hold one, and finally
      * hand the memory back through sk_prot_free() using sk_prot_creator.
2061  */
2062 static void __sk_destruct(struct rcu_head *head)
2063 {
2064         struct sock *sk = container_of(head, struct sock, sk_rcu);
2065         struct sk_filter *filter;
2066
2067         if (sk->sk_destruct)
2068                 sk->sk_destruct(sk);
2069
             /* The check condition documents that no write memory is
              * accounted to the sock any more at this point.
              */
2070         filter = rcu_dereference_check(sk->sk_filter,
2071                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2072         if (filter) {
2073                 sk_filter_uncharge(sk, filter);
2074                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2075         }
2076
2077         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2078
2079 #ifdef CONFIG_BPF_SYSCALL
2080         bpf_sk_storage_free(sk);
2081 #endif
2082
             /* Anything still charged to the option memory is only reported. */
2083         if (atomic_read(&sk->sk_omem_alloc))
2084                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2085                          __func__, atomic_read(&sk->sk_omem_alloc));
2086
2087         if (sk->sk_frag.page) {
2088                 put_page(sk->sk_frag.page);
2089                 sk->sk_frag.page = NULL;
2090         }
2091
2092         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2093         put_cred(sk->sk_peer_cred);
2094         put_pid(sk->sk_peer_pid);
2095
2096         if (likely(sk->sk_net_refcnt))
2097                 put_net_track(sock_net(sk), &sk->ns_tracker);
2098         sk_prot_free(sk->sk_prot_creator, sk);
2099 }
2100
     /*
      * Destroy @sk through __sk_destruct(), either right away or deferred via
      * call_rcu().
      *
      * The deferred path is used when the sock has SOCK_RCU_FREE set, or when
      * it still has a reuseport control block (sk_reuseport_cb); in the
      * latter case the sock is first detached with reuseport_detach_sock().
      */
2101 void sk_destruct(struct sock *sk)
2102 {
2103         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2104
2105         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2106                 reuseport_detach_sock(sk);
2107                 use_call_rcu = true;
2108         }
2109
2110         if (use_call_rcu)
2111                 call_rcu(&sk->sk_rcu, __sk_destruct);
2112         else
2113                 __sk_destruct(&sk->sk_rcu);
2114 }
2115
     /*
      * Start freeing @sk once its sk_wmem_alloc count has reached zero (see
      * the callers sk_free(), sock_wfree() and __sock_wfree()).
      *
      * For sockets holding a netns reference the per-netns in-use counter is
      * decremented.  Such a sock is passed to sock_diag_broadcast_destroy()
      * when sock_diag_has_destroy_listeners() reports listeners; every other
      * sock goes straight to sk_destruct().
      * NOTE(review): the broadcast path is presumably responsible for the
      * eventual sk_destruct() call - confirm in sock_diag.
      */
2116 static void __sk_free(struct sock *sk)
2117 {
2118         if (likely(sk->sk_net_refcnt))
2119                 sock_inuse_add(sock_net(sk), -1);
2120
2121         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2122                 sock_diag_broadcast_destroy(sk);
2123         else
2124                 sk_destruct(sk);
2125 }
2126
     /*
      * Drop one count from sk->sk_wmem_alloc and free the sock through
      * __sk_free() if that was the last one.
      */
2127 void sk_free(struct sock *sk)
2128 {
2129         /*
2130          * We subtract one from sk_wmem_alloc and can know if
2131          * some packets are still in some tx queue.
2132          * If not null, sock_wfree() will call __sk_free(sk) later
2133          */
2134         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2135                 __sk_free(sk);
2136 }
2137 EXPORT_SYMBOL(sk_free);
2138
     /*
      * Initialise the receive, write and error skb queues and the callback
      * rwlock of @sk, then give each of the four locks a lockdep class and
      * name chosen by sk->sk_family.
      */
2139 static void sk_init_common(struct sock *sk)
2140 {
2141         skb_queue_head_init(&sk->sk_receive_queue);
2142         skb_queue_head_init(&sk->sk_write_queue);
2143         skb_queue_head_init(&sk->sk_error_queue);
2144
2145         rwlock_init(&sk->sk_callback_lock);
             /* Per-family lockdep classes: rlock/wlock/elock for the three
              * queue locks, callback keys for sk_callback_lock.
              */
2146         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2147                         af_rlock_keys + sk->sk_family,
2148                         af_family_rlock_key_strings[sk->sk_family]);
2149         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2150                         af_wlock_keys + sk->sk_family,
2151                         af_family_wlock_key_strings[sk->sk_family]);
2152         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2153                         af_elock_keys + sk->sk_family,
2154                         af_family_elock_key_strings[sk->sk_family]);
2155         lockdep_set_class_and_name(&sk->sk_callback_lock,
2156                         af_callback_keys + sk->sk_family,
2157                         af_family_clock_key_strings[sk->sk_family]);
2158 }
2159
2160 /**
2161  *      sk_clone_lock - clone a socket, and lock its clone
2162  *      @sk: the socket to clone
2163  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2164  *
2165  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
      *
      *      The clone is allocated from sk's protocol, filled in by sock_copy()
      *      and then has its per-socket state (lock, backlog, queues, memory
      *      accounting counters, error fields, ...) reinitialised.  It is
      *      returned with bh_lock_sock() held and sk_refcnt set to 2.
      *
      *      Failures that happen inside this function after the lock has been
      *      taken are cleaned up here through sk_free_unlock_clone().
      *
      *      Return: the locked clone, or NULL on failure.
2166  */
2167 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2168 {
2169         struct proto *prot = READ_ONCE(sk->sk_prot);
2170         struct sk_filter *filter;
2171         bool is_charged = true;
2172         struct sock *newsk;
2173
2174         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2175         if (!newsk)
2176                 goto out;
2177
2178         sock_copy(newsk, sk);
2179
2180         newsk->sk_prot_creator = prot;
2181
2182         /* SANITY */
2183         if (likely(newsk->sk_net_refcnt)) {
2184                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2185                 sock_inuse_add(sock_net(newsk), 1);
2186         }
2187         sk_node_init(&newsk->sk_node);
2188         sock_lock_init(newsk);
2189         bh_lock_sock(newsk);
2190         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2191         newsk->sk_backlog.len = 0;
2192
2193         atomic_set(&newsk->sk_rmem_alloc, 0);
2194
2195         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2196         refcount_set(&newsk->sk_wmem_alloc, 1);
2197
2198         atomic_set(&newsk->sk_omem_alloc, 0);
2199         sk_init_common(newsk);
2200
2201         newsk->sk_dst_cache     = NULL;
2202         newsk->sk_dst_pending_confirm = 0;
2203         newsk->sk_wmem_queued   = 0;
2204         newsk->sk_forward_alloc = 0;
2205         newsk->sk_reserved_mem  = 0;
2206         atomic_set(&newsk->sk_drops, 0);
2207         newsk->sk_send_head     = NULL;
             /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
2208         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2209         atomic_set(&newsk->sk_zckey, 0);
2210
2211         sock_reset_flag(newsk, SOCK_DONE);
2212
2213         /* sk->sk_memcg will be populated at accept() time */
2214         newsk->sk_memcg = NULL;
2215
2216         cgroup_sk_clone(&newsk->sk_cgrp_data);
2217
2218         rcu_read_lock();
2219         filter = rcu_dereference(sk->sk_filter);
2220         if (filter != NULL)
2221                 /* though it's an empty new sock, the charging may fail
2222                  * if sysctl_optmem_max was changed between creation of
2223                  * original socket and cloning
2224                  */
2225                 is_charged = sk_filter_charge(newsk, filter);
2226         RCU_INIT_POINTER(newsk->sk_filter, filter);
2227         rcu_read_unlock();
2228
2229         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2230                 /* We need to make sure that we don't uncharge the new
2231                  * socket if we couldn't charge it in the first place
2232                  * as otherwise we uncharge the parent's filter.
2233                  */
2234                 if (!is_charged)
2235                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2236                 sk_free_unlock_clone(newsk);
2237                 newsk = NULL;
2238                 goto out;
2239         }
2240         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2241
2242         if (bpf_sk_storage_clone(sk, newsk)) {
2243                 sk_free_unlock_clone(newsk);
2244                 newsk = NULL;
2245                 goto out;
2246         }
2247
2248         /* Clear sk_user_data if parent had the pointer tagged
2249          * as not suitable for copying when cloning.
2250          */
2251         if (sk_user_data_is_nocopy(newsk))
2252                 newsk->sk_user_data = NULL;
2253
2254         newsk->sk_err      = 0;
2255         newsk->sk_err_soft = 0;
2256         newsk->sk_priority = 0;
2257         newsk->sk_incoming_cpu = raw_smp_processor_id();
2258
2259         /* Before updating sk_refcnt, we must commit prior changes to memory
2260          * (Documentation/RCU/rculist_nulls.rst for details)
2261          */
2262         smp_wmb();
2263         refcount_set(&newsk->sk_refcnt, 2);
2264
2265         /* Increment the counter in the same struct proto as the master
2266          * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2267          * is the same as sk->sk_prot->socks, as this field was copied
2268          * with memcpy).
2269          *
2270          * This _changes_ the previous behaviour, where
2271          * tcp_create_openreq_child always was incrementing the
2272          * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2273          * to be taken into account in all callers. -acme
2274          */
2275         sk_refcnt_debug_inc(newsk);
             /* The clone starts without a struct socket and without a wait queue. */
2276         sk_set_socket(newsk, NULL);
2277         sk_tx_queue_clear(newsk);
2278         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2279
2280         if (newsk->sk_prot->sockets_allocated)
2281                 sk_sockets_allocated_inc(newsk);
2282
             /* The timestamp flags were copied from the parent; account for
              * the clone as one more timestamp user.
              */
2283         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2284                 net_enable_timestamp();
2285 out:
2286         return newsk;
2287 }
2288 EXPORT_SYMBOL_GPL(sk_clone_lock);
2289
     /*
      * Dispose of a clone that is still bh-locked (see sk_clone_lock()):
      * clear its sk_destruct callback, release the lock with bh_unlock_sock()
      * and drop it with sk_free().
      */
2290 void sk_free_unlock_clone(struct sock *sk)
2291 {
2292         /* It is still raw copy of parent, so invalidate
2293          * destructor and make plain sk_free() */
2294         sk->sk_destruct = NULL;
2295         bh_unlock_sock(sk);
2296         sk_free(sk);
2297 }
2298 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2299
     /*
      * Clamp sk->sk_gso_max_size to GSO_LEGACY_MAX_SIZE.
      *
      * Nothing is changed when the value is already within that limit, nor
      * (with CONFIG_IPV6 enabled) for an AF_INET6 TCP sock whose
      * sk_v6_rcv_saddr is not a v4-mapped address.
      */
2300 static void sk_trim_gso_size(struct sock *sk)
2301 {
2302         if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2303                 return;
2304 #if IS_ENABLED(CONFIG_IPV6)
2305         if (sk->sk_family == AF_INET6 &&
2306             sk_is_tcp(sk) &&
2307             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2308                 return;
2309 #endif
2310         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2311 }
2312
     /*
      * Attach route @dst to @sk and derive the sock's offload capabilities
      * (sk_route_caps, sk_gso_max_size, sk_gso_max_segs) from the route's
      * device.
      *
      * sk_route_caps starts from dst->dev->features; TCP socks always get
      * NETIF_F_GSO, NETIF_F_GSO implies NETIF_F_GSO_SOFTWARE, and
      * sk->sk_gso_disabled removes every bit in NETIF_F_GSO_MASK again.
      * sk_gso_max_segs is left at 1 unless sk_can_gso() holds and the route
      * either has no header_len or passes xfrm_dst_offload_ok().
      */
2313 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2314 {
2315         u32 max_segs = 1;
2316
2317         sk_dst_set(sk, dst);
2318         sk->sk_route_caps = dst->dev->features;
2319         if (sk_is_tcp(sk))
2320                 sk->sk_route_caps |= NETIF_F_GSO;
2321         if (sk->sk_route_caps & NETIF_F_GSO)
2322                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2323         if (unlikely(sk->sk_gso_disabled))
2324                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2325         if (sk_can_gso(sk)) {
2326                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2327                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2328                 } else {
2329                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2330                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2331                         sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2332                         sk_trim_gso_size(sk);
                             /* Leave room below the device limit for headers. */
2333                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2334                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2335                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2336                 }
2337         }
2338         sk->sk_gso_max_segs = max_segs;
2339 }
2340 EXPORT_SYMBOL_GPL(sk_setup_caps);
2341
2342 /*
2343  *      Simple resource managers for sockets.
2344  */
2345
2346
2347 /*
2348  * Write buffer destructor automatically called from kfree_skb.
      *
      * Gives skb->truesize back to sk->sk_wmem_alloc and frees the sock
      * through __sk_free() when that count reaches zero.
      *
      * Unless the sock has SOCK_USE_WRITE_QUEUE set, the write-space callback
      * is run as well.  Two variants exist:
      *  - SOCK_RCU_FREE sock using sock_def_write_space: the whole truesize
      *    is subtracted and sock_def_write_space_wfree() is called, both
      *    inside one RCU read-side section.
      *  - otherwise: all but one unit is subtracted first, sk_write_space()
      *    is called, and the remaining unit is dropped afterwards.
2349  */
2350 void sock_wfree(struct sk_buff *skb)
2351 {
2352         struct sock *sk = skb->sk;
2353         unsigned int len = skb->truesize;
2354         bool free;
2355
2356         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2357                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2358                     sk->sk_write_space == sock_def_write_space) {
                             /* NOTE(review): the RCU read lock presumably
                              * keeps sk usable for the write-space call even
                              * if the count just dropped to zero - confirm.
                              */
2359                         rcu_read_lock();
2360                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2361                         sock_def_write_space_wfree(sk);
2362                         rcu_read_unlock();
2363                         if (unlikely(free))
2364                                 __sk_free(sk);
2365                         return;
2366                 }
2367
2368                 /*
2369                  * Keep a reference on sk_wmem_alloc, this will be released
2370                  * after sk_write_space() call
2371                  */
2372                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2373                 sk->sk_write_space(sk);
2374                 len = 1;
2375         }
2376         /*
2377          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2378          * could not do because of in-flight packets
2379          */
2380         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2381                 __sk_free(sk);
2382 }
2383 EXPORT_SYMBOL(sock_wfree);
2384
2385 /* This variant of sock_wfree() is used by TCP,
2386  * since it sets SOCK_USE_WRITE_QUEUE.
      *
      * It only subtracts skb->truesize from sk->sk_wmem_alloc and frees the
      * sock through __sk_free() when the count reaches zero; no write-space
      * callback is invoked.
2387  */
2388 void __sock_wfree(struct sk_buff *skb)
2389 {
2390         struct sock *sk = skb->sk;
2391
2392         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2393                 __sk_free(sk);
2394 }
2395
     /*
      * Make @sk the owner of @skb on the write side.
      *
      * Any previous owner is dropped first with skb_orphan().  For a full
      * sock the destructor becomes sock_wfree() and skb->truesize is added to
      * sk->sk_wmem_alloc.  With CONFIG_INET, a sock that is not a full sock
      * (!sk_fullsock()) instead gets a sock_hold() reference and sock_edemux
      * as the destructor.
      */
2396 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2397 {
2398         skb_orphan(skb);
2399         skb->sk = sk;
2400 #ifdef CONFIG_INET
2401         if (unlikely(!sk_fullsock(sk))) {
2402                 skb->destructor = sock_edemux;
2403                 sock_hold(sk);
2404                 return;
2405         }
2406 #endif
2407         skb->destructor = sock_wfree;
2408         skb_set_hash_from_sk(skb, sk);
2409         /*
2410          * We used to take a refcount on sk, but following operation
2411          * is enough to guarantee sk_free() won't free this sock until
2412          * all in-flight packets are completed
2413          */
2414         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2415 }
2416 EXPORT_SYMBOL(skb_set_owner_w);
2417
     /*
      * Tell whether @skb qualifies for the partial orphan done by
      * skb_orphan_partial(): its destructor must be sock_wfree, or tcp_wfree
      * when CONFIG_INET is enabled.  With CONFIG_TLS_DEVICE, skbs marked
      * decrypted never qualify.
      */
2418 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2419 {
2420 #ifdef CONFIG_TLS_DEVICE
2421         /* Drivers depend on in-order delivery for crypto offload,
2422          * partial orphan breaks out-of-order-OK logic.
2423          */
2424         if (skb->decrypted)
2425                 return false;
2426 #endif
2427         return (skb->destructor == sock_wfree ||
2428                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2429 }
2430
2431 /* This helper is used by netem, as it can hold packets in its
2432  * delay queue. We want to allow the owner socket to send more
2433  * packets, as if they were already TX completed by a typical driver.
2434  * But we also want to keep skb->sk set because some packet schedulers
2435  * rely on it (sch_fq for example).
      *
      * TCP pure ACKs are left untouched.  Otherwise, if the skb qualifies
      * (can_skb_orphan_partial()) and skb_set_owner_sk_safe() succeeds in
      * re-owning it to the same sock, skb->sk stays set; in every other case
      * the skb is fully orphaned with skb_orphan().
2436  */
2437 void skb_orphan_partial(struct sk_buff *skb)
2438 {
2439         if (skb_is_tcp_pure_ack(skb))
2440                 return;
2441
2442         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2443                 return;
2444
2445         skb_orphan(skb);
2446 }
2447 EXPORT_SYMBOL(skb_orphan_partial);
2448
2449 /*
2450  * Read buffer destructor automatically called from kfree_skb.
      *
      * Subtracts skb->truesize from sk->sk_rmem_alloc and uncharges the same
      * amount through sk_mem_uncharge().
2451  */
2452 void sock_rfree(struct sk_buff *skb)
2453 {
2454         struct sock *sk = skb->sk;
2455         unsigned int len = skb->truesize;
2456
2457         atomic_sub(len, &sk->sk_rmem_alloc);
2458         sk_mem_uncharge(sk, len);
2459 }
2460 EXPORT_SYMBOL(sock_rfree);
2461
2462 /*
2463  * Buffer destructor for skbs that are not used directly in read or write
2464  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
      *
      * Drops one reference on skb->sk with sock_put(); no memory accounting
      * is touched.
2465  */
2466 void sock_efree(struct sk_buff *skb)
2467 {
2468         sock_put(skb->sk);
2469 }
2470 EXPORT_SYMBOL(sock_efree);
2471
2472 /* Buffer destructor for prefetch/receive path where reference count may
2473  * not be held, e.g. for listen sockets.
      *
      * A reference is dropped (sock_gen_put()) only when sk_is_refcounted()
      * reports that skb->sk is reference counted.  Built only with
      * CONFIG_INET.
2474  */
2475 #ifdef CONFIG_INET
2476 void sock_pfree(struct sk_buff *skb)
2477 {
2478         if (sk_is_refcounted(skb->sk))
2479                 sock_gen_put(skb->sk);
2480 }
2481 EXPORT_SYMBOL(sock_pfree);
2482 #endif /* CONFIG_INET */
2483
     /*
      * Return the i_uid of the inode behind sk->sk_socket, or GLOBAL_ROOT_UID
      * when the sock has no struct socket attached.  sk_socket is sampled
      * with sk_callback_lock read-held (BH variant).
      */
2484 kuid_t sock_i_uid(struct sock *sk)
2485 {
2486         kuid_t uid;
2487
2488         read_lock_bh(&sk->sk_callback_lock);
2489         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2490         read_unlock_bh(&sk->sk_callback_lock);
2491         return uid;
2492 }
2493 EXPORT_SYMBOL(sock_i_uid);
2494
     /*
      * Return the i_ino of the inode behind sk->sk_socket, or 0 when the sock
      * has no struct socket attached.  sk_socket is sampled with
      * sk_callback_lock read-held (BH variant).
      */
2495 unsigned long sock_i_ino(struct sock *sk)
2496 {
2497         unsigned long ino;
2498
2499         read_lock_bh(&sk->sk_callback_lock);
2500         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2501         read_unlock_bh(&sk->sk_callback_lock);
2502         return ino;
2503 }
2504 EXPORT_SYMBOL(sock_i_ino);
2505
2506 /*
2507  * Allocate a skb from the socket's send buffer.
      *
      * The allocation is attempted when @force is non-zero or while
      * sk_wmem_alloc is still below sk_sndbuf.  A successfully allocated skb
      * is charged to @sk through skb_set_owner_w().
      *
      * Returns the skb, or NULL when the send buffer limit is reached or
      * alloc_skb() fails.
2508  */
2509 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2510                              gfp_t priority)
2511 {
2512         if (force ||
2513             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2514                 struct sk_buff *skb = alloc_skb(size, priority);
2515
2516                 if (skb) {
2517                         skb_set_owner_w(skb, sk);
2518                         return skb;
2519                 }
2520         }
2521         return NULL;
2522 }
2523 EXPORT_SYMBOL(sock_wmalloc);
2524
     /*
      * skb destructor installed by sock_omalloc(): gives skb->truesize back
      * to sk->sk_omem_alloc.
      */
2525 static void sock_ofree(struct sk_buff *skb)
2526 {
2527         struct sock *sk = skb->sk;
2528
2529         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2530 }
2531
     /*
      * Allocate an skb that is accounted against the option memory of @sk.
      *
      * The request is refused when sk_omem_alloc plus SKB_TRUESIZE(size)
      * would exceed sysctl_optmem_max.  On success the skb's real truesize is
      * added to sk_omem_alloc, skb->sk is set (without taking a reference
      * here) and sock_ofree() becomes the destructor that undoes the charge.
      *
      * Returns the skb, or NULL if over the limit or alloc_skb() fails.
      */
2532 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2533                              gfp_t priority)
2534 {
2535         struct sk_buff *skb;
2536
2537         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2538         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2539             READ_ONCE(sysctl_optmem_max))
2540                 return NULL;
2541
2542         skb = alloc_skb(size, priority);
2543         if (!skb)
2544                 return NULL;
2545
2546         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2547         skb->sk = sk;
2548         skb->destructor = sock_ofree;
2549         return skb;
2550 }
2551
2552 /*
2553  * Allocate a memory block from the socket's option memory buffer.
      *
      * The request is accepted only if @size itself does not exceed
      * sysctl_optmem_max and the sock's current sk_omem_alloc plus @size stays
      * strictly below it.  @size is charged to sk_omem_alloc before kmalloc()
      * is called and the charge is taken back if kmalloc() fails.
      *
      * Returns the block, or NULL when the limit would be exceeded or the
      * allocation fails.
2554  */
2555 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2556 {
2557         int optmem_max = READ_ONCE(sysctl_optmem_max);
2558
             /* The unsigned cast makes a negative @size compare as a very
              * large value in the first test.
              */
2559         if ((unsigned int)size <= optmem_max &&
2560             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2561                 void *mem;
2562                 /* First do the add, to avoid the race if kmalloc
2563                  * might sleep.
2564                  */
2565                 atomic_add(size, &sk->sk_omem_alloc);
2566                 mem = kmalloc(size, priority);
2567                 if (mem)
2568                         return mem;
2569                 atomic_sub(size, &sk->sk_omem_alloc);
2570         }
2571         return NULL;
2572 }
2573 EXPORT_SYMBOL(sock_kmalloc);
2574
2575 /* Free an option memory block. Note, we actually want the inline
2576  * here as this allows gcc to detect the nullify and fold away the
2577  * condition entirely.
      *
      * @mem is released with kfree_sensitive() when @nullify is true and with
      * kfree() otherwise; @size is then subtracted from sk->sk_omem_alloc.
      * A NULL @mem triggers a one-time warning and returns without touching
      * the accounting.
2578  */
2579 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2580                                   const bool nullify)
2581 {
2582         if (WARN_ON_ONCE(!mem))
2583                 return;
2584         if (nullify)
2585                 kfree_sensitive(mem);
2586         else
2587                 kfree(mem);
2588         atomic_sub(size, &sk->sk_omem_alloc);
2589 }
2590
/* Free an option memory block of @size bytes with kfree() and subtract
 * @size from @sk's sk_omem_alloc counter.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
2595 EXPORT_SYMBOL(sock_kfree_s);
2596
/* Same as sock_kfree_s(), but the block is released with kfree_sensitive()
 * instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
2601 EXPORT_SYMBOL(sock_kzfree_s);
2602
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * Sleeps in TASK_INTERRUPTIBLE state on sk_sleep(sk) until one of the
 * loop's exit conditions holds: the timeout is exhausted, a signal is
 * pending, sk_wmem_alloc is below sk_sndbuf, SEND_SHUTDOWN is set, or
 * sk_err is non-zero. Returns the timeout that is left.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                /* Join the wait queue before re-testing the wake conditions. */
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}
2629
2630
2631 /*
2632  *      Generic send/receive buffer handlers
2633  */
2634
/**
 * sock_alloc_send_pskb - allocate a send skb, waiting for send-buffer room
 * @sk: socket that becomes the write owner of the skb
 * @header_len: forwarded to alloc_skb_with_frags()
 * @data_len: forwarded to alloc_skb_with_frags()
 * @noblock: forwarded to sock_sndtimeo() to pick the wait timeout
 * @errcode: receives the error code when this function fails
 * @max_page_order: forwarded to alloc_skb_with_frags()
 *
 * Loops until sk_wmem_alloc is below sk_sndbuf, then allocates the skb.
 * The loop fails with the pending socket error, with -EPIPE once
 * SEND_SHUTDOWN is set, with -EAGAIN when no timeout is left, or with
 * sock_intr_errno() when a signal is pending.
 *
 * Return: the new skb, or NULL. On the loop's failure paths *@errcode is
 * written here; on allocation failure @errcode is only handed to
 * alloc_skb_with_frags().
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                /* err is staged before each test so "failure" reports it. */
                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
2677 EXPORT_SYMBOL(sock_alloc_send_pskb);
2678
2679 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2680                      struct sockcm_cookie *sockc)
2681 {
2682         u32 tsflags;
2683
2684         switch (cmsg->cmsg_type) {
2685         case SO_MARK:
2686                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2687                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2688                         return -EPERM;
2689                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2690                         return -EINVAL;
2691                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2692                 break;
2693         case SO_TIMESTAMPING_OLD:
2694                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2695                         return -EINVAL;
2696
2697                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2698                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2699                         return -EINVAL;
2700
2701                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2702                 sockc->tsflags |= tsflags;
2703                 break;
2704         case SCM_TXTIME:
2705                 if (!sock_flag(sk, SOCK_TXTIME))
2706                         return -EINVAL;
2707                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2708                         return -EINVAL;
2709                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2710                 break;
2711         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2712         case SCM_RIGHTS:
2713         case SCM_CREDENTIALS:
2714                 break;
2715         default:
2716                 return -EINVAL;
2717         }
2718         return 0;
2719 }
2720 EXPORT_SYMBOL(__sock_cmsg_send);
2721
2722 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2723                    struct sockcm_cookie *sockc)
2724 {
2725         struct cmsghdr *cmsg;
2726         int ret;
2727
2728         for_each_cmsghdr(cmsg, msg) {
2729                 if (!CMSG_OK(msg, cmsg))
2730                         return -EINVAL;
2731                 if (cmsg->cmsg_level != SOL_SOCKET)
2732                         continue;
2733                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2734                 if (ret)
2735                         return ret;
2736         }
2737         return 0;
2738 }
2739 EXPORT_SYMBOL(sock_cmsg_send);
2740
2741 static void sk_enter_memory_pressure(struct sock *sk)
2742 {
2743         if (!sk->sk_prot->enter_memory_pressure)
2744                 return;
2745
2746         sk->sk_prot->enter_memory_pressure(sk);
2747 }
2748
2749 static void sk_leave_memory_pressure(struct sock *sk)
2750 {
2751         if (sk->sk_prot->leave_memory_pressure) {
2752                 sk->sk_prot->leave_memory_pressure(sk);
2753         } else {
2754                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2755
2756                 if (memory_pressure && READ_ONCE(*memory_pressure))
2757                         WRITE_ONCE(*memory_pressure, 0);
2758         }
2759 }
2760
2761 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2762
/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 *
 * Return: true when @pfrag holds a page to use (the current one or a newly
 * allocated one), false when page allocation failed.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* Ours is the only reference: rewind and reuse the page. */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                /* Not enough room left: drop our reference and replace it. */
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page with the caller's unmodified gfp. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        return false;
}
2805 EXPORT_SYMBOL(skb_page_frag_refill);
2806
2807 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2808 {
2809         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2810                 return true;
2811
2812         sk_enter_memory_pressure(sk);
2813         sk_stream_moderate_sndbuf(sk);
2814         return false;
2815 }
2816 EXPORT_SYMBOL(sk_page_frag_refill);
2817
/* Wait as an exclusive, uninterruptible waiter on sk_lock.wq until the
 * socket is no longer owned by a user. Per the sparse annotations the
 * caller holds sk_lock.slock on entry and on return; the spinlock is
 * dropped only around schedule().
 */
void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                /* Ownership is re-checked with the spinlock held. */
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}
2835
/* Drain the socket backlog. Each pass detaches the whole backlog list,
 * drops sk_lock.slock, hands every skb to sk_backlog_rcv(), then retakes
 * the spinlock and checks whether new skbs were queued meanwhile.
 * Per the sparse annotations the spinlock is held on entry and on return.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        while ((skb = sk->sk_backlog.head) != NULL) {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        /* Save the successor before the skb is unlinked. */
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
2868
/* Process the socket backlog now: take sk_lock.slock with bottom halves
 * disabled, run __release_sock(), and drop the spinlock again.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);
        spin_unlock_bh(&sk->sk_lock.slock);
}
2875 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2876
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the result of sk_wait_event() for the condition "the tail of
 * sk_receive_queue is no longer @skb".
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        /* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
2900 EXPORT_SYMBOL(sk_wait_data);
2901
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @amt: pages to allocate
 *      @kind: allocation type
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *      Return: 1 when the allocation is accepted (the @amt pages stay
 *      accounted), 0 when it is suppressed (the protocol accounting and any
 *      successful memcg charge are rolled back).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
        struct proto *prot = sk->sk_prot;
        bool charged = true;
        long allocated;

        /* Account optimistically; undone at the end if we refuse. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);
        if (memcg_charge &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                                gfp_memcg_charge())))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /* Accept if limit 2 exceeds socket count times this socket's pages. */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_charge && !charged) {
                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Refused: roll back the optimistic accounting done on entry. */
        sk_memory_allocated_sub(sk, amt);

        if (memcg_charge && charged)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
2996
2997 /**
2998  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2999  *      @sk: socket
3000  *      @size: memory size to allocate
3001  *      @kind: allocation type
3002  *
3003  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3004  *      rmem allocation. This function assumes that protocols which have
3005  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3006  */
3007 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3008 {
3009         int ret, amt = sk_mem_pages(size);
3010
3011         sk->sk_forward_alloc += amt << PAGE_SHIFT;
3012         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3013         if (!ret)
3014                 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3015         return ret;
3016 }
3017 EXPORT_SYMBOL(__sk_mem_schedule);
3018
3019 /**
3020  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3021  *      @sk: socket
3022  *      @amount: number of quanta
3023  *
3024  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3025  */
3026 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3027 {
3028         sk_memory_allocated_sub(sk, amount);
3029
3030         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3031                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3032
3033         if (sk_under_memory_pressure(sk) &&
3034             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3035                 sk_leave_memory_pressure(sk);
3036 }
3037
3038 /**
3039  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3040  *      @sk: socket
3041  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3042  */
3043 void __sk_mem_reclaim(struct sock *sk, int amount)
3044 {
3045         amount >>= PAGE_SHIFT;
3046         sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3047         __sk_mem_reduce_allocated(sk, amount);
3048 }
3049 EXPORT_SYMBOL(__sk_mem_reclaim);
3050
/* Store @val in sk->sk_peek_off. Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
        sk->sk_peek_off = val;
        return 0;
}
3056 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3057
3058 /*
3059  * Set of default routines for initialising struct proto_ops when
3060  * the protocol does not support a particular function. In certain
3061  * cases where it makes no sense for a protocol to have a "do nothing"
3062  * function, some default processing is provided.
3063  */
3064
/* Default bind handler for protocols without bind support: always fails
 * with -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
3069 EXPORT_SYMBOL(sock_no_bind);
3070
/* Default connect handler for protocols without connect support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
3076 EXPORT_SYMBOL(sock_no_connect);
3077
/* Default socketpair handler for protocols without socketpair support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
3082 EXPORT_SYMBOL(sock_no_socketpair);
3083
/* Default accept handler for protocols without accept support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
3089 EXPORT_SYMBOL(sock_no_accept);
3090
/* Default getname handler for protocols without getname support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
3096 EXPORT_SYMBOL(sock_no_getname);
3097
/* Default ioctl handler for protocols without ioctl support: always fails
 * with -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
3102 EXPORT_SYMBOL(sock_no_ioctl);
3103
/* Default listen handler for protocols without listen support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
3108 EXPORT_SYMBOL(sock_no_listen);
3109
/* Default shutdown handler for protocols without shutdown support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
3114 EXPORT_SYMBOL(sock_no_shutdown);
3115
/* Default sendmsg handler for protocols without sendmsg support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3120 EXPORT_SYMBOL(sock_no_sendmsg);
3121
/* Counterpart of sock_no_sendmsg() that takes a struct sock: always fails
 * with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3126 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3127
/* Default recvmsg handler for protocols without recvmsg support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
3133 EXPORT_SYMBOL(sock_no_recvmsg);
3134
/* Default mmap handler for protocols without mmap support. Unlike the
 * other sock_no_*() stubs it returns -ENODEV, not -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
3140 EXPORT_SYMBOL(sock_no_mmap);
3141
3142 /*
3143  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3144  * various sock-based usage counts.
3145  */
3146 void __receive_sock(struct file *file)
3147 {
3148         struct socket *sock;
3149
3150         sock = sock_from_file(file);
3151         if (sock) {
3152                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3153                 sock_update_classid(&sock->sk->sk_cgrp_data);
3154         }
3155 }
3156
3157 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3158 {
3159         ssize_t res;
3160         struct msghdr msg = {.msg_flags = flags};
3161         struct kvec iov;
3162         char *kaddr = kmap(page);
3163         iov.iov_base = kaddr + offset;
3164         iov.iov_len = size;
3165         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3166         kunmap(page);
3167         return res;
3168 }
3169 EXPORT_SYMBOL(sock_no_sendpage);
3170
3171 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3172                                 int offset, size_t size, int flags)
3173 {
3174         ssize_t res;
3175         struct msghdr msg = {.msg_flags = flags};
3176         struct kvec iov;
3177         char *kaddr = kmap(page);
3178
3179         iov.iov_base = kaddr + offset;
3180         iov.iov_len = size;
3181         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3182         kunmap(page);
3183         return res;
3184 }
3185 EXPORT_SYMBOL(sock_no_sendpage_locked);
3186
3187 /*
3188  *      Default Socket Callbacks
3189  */
3190
/* Default sk_state_change callback: if the socket wait queue has a
 * sleeper, wake all interruptible waiters on it. sk_wq is read under
 * rcu_read_lock().
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}
3201
/* Default sk_error_report callback: wake pollers with EPOLLERR if the wait
 * queue has a sleeper, then send the SOCK_WAKE_IO / POLL_ERR async
 * notification. The async notification is sent whether or not anyone was
 * sleeping.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}
3213
/* Default sk_data_ready callback: wake pollers waiting for readable events
 * if the wait queue has a sleeper, then send the SOCK_WAKE_WAITD / POLL_IN
 * async notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}
3226
3227 static void sock_def_write_space(struct sock *sk)
3228 {
3229         struct socket_wq *wq;
3230
3231         rcu_read_lock();
3232
3233         /* Do not wake up a writer until he can make "significant"
3234          * progress.  --DaveM
3235          */
3236         if (sock_writeable(sk)) {
3237                 wq = rcu_dereference(sk->sk_wq);
3238                 if (skwq_has_sleeper(wq))
3239                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3240                                                 EPOLLWRNORM | EPOLLWRBAND);
3241
3242                 /* Should agree with poll, otherwise some programs break */
3243                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3244         }
3245
3246         rcu_read_unlock();
3247 }
3248
/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * It takes no RCU read lock of its own, and it tests the wait queue with
 * waitqueue_active() after an explicit barrier instead of calling
 * skwq_has_sleeper().
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}
3271
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3275
3276 void sk_send_sigurg(struct sock *sk)
3277 {
3278         if (sk->sk_socket && sk->sk_socket->file)
3279                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3280                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3281 }
3282 EXPORT_SYMBOL(sk_send_sigurg);
3283
/* (Re)arm @timer to fire at @expires. A reference on @sk is taken only
 * when mod_timer() returns zero.
 * NOTE(review): presumably zero means the timer was not already pending,
 * so each pending timer holds one reference - confirm against mod_timer().
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}
3290 EXPORT_SYMBOL(sk_reset_timer);
3291
/* Cancel @timer with del_timer(). If del_timer() returns non-zero, drop
 * one reference on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (del_timer(timer))
                __sock_put(sk);
}
3297 EXPORT_SYMBOL(sk_stop_timer);
3298
/* Same as sk_stop_timer(), but cancels with del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (del_timer_sync(timer))
                __sock_put(sk);
}
3304 EXPORT_SYMBOL(sk_stop_timer_sync);
3305
/* Initialise the generic fields of @sk and, when @sock is non-NULL, tie the
 * two together (sk_type, sk_wq, sock->sk and sk_uid come from @sock).
 * Without a @sock, sk_wq is NULL and sk_uid is uid 0 of the socket's
 * network-namespace user namespace.
 *
 * Default callbacks, buffer sizes, timeouts and pacing values are
 * installed, and the reference count is set to 1 as the final publishing
 * step after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                sk->sk_type     =       sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =       sk;
                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
        }

        rwlock_init(&sk->sk_callback_lock);
        /* Kernel and user sockets get separate lockdep classes per family. */
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
#endif

        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;
        sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
3388 EXPORT_SYMBOL(sock_init_data);
3389
/* Take ownership of the socket lock; may sleep. @subclass is passed to
 * lockdep only. If the socket is already owned, __lock_sock() is used to
 * wait, after which sk_lock.owned is set and the spinlock is released.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
3402 EXPORT_SYMBOL(lock_sock_nested);
3403
/* Give up ownership of the socket lock. Under sk_lock.slock this runs the
 * backlog if its tail is non-NULL, calls the protocol's release_cb if one
 * is set, calls sock_release_ownership() and wakes waiters on sk_lock.wq
 * if there are any.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
3421 EXPORT_SYMBOL(release_sock);
3422
/* Lock the socket, preferring a fast path that only holds the spinlock.
 *
 * Returns false on the fast path: the socket was not owned, and the
 * function returns with sk_lock.slock held and bottom halves disabled.
 * Returns true on the slow path: the function waited in __lock_sock(),
 * set sk_lock.owned and released the spinlock before returning.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Sparse-only annotation to balance the __acquires() on the signature. */
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
3453 EXPORT_SYMBOL(__lock_sock_fast);
3454
/* Copy the socket's timestamp to user space.
 *
 * @timeval: report microseconds (tv_nsec is divided by 1000) instead of
 *           nanoseconds
 * @time32:  with CONFIG_COMPAT_32BIT_TIME, write the old 32-bit layout
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect. Returns -ENOENT
 * when the stored stamp has tv_sec == -1. When tv_sec is 0, the current
 * real time is stored in the socket and reported instead. Otherwise
 * returns the result of the copy-out helper, or -EFAULT on the sparc64
 * path.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32)
{
        struct sock *sk = sock->sk;
        struct timespec64 ts;

        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec64(sock_read_timestamp(sk));
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                ktime_t kt = ktime_get_real();
                sock_write_timestamp(sk, kt);
                ts = ktime_to_timespec64(kt);
        }

        /* From here on tv_nsec carries microseconds when @timeval is set. */
        if (timeval)
                ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
        if (time32)
                return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
        /* beware of padding in sparc64 timeval */
        if (timeval && !in_compat_syscall()) {
                struct __kernel_old_timeval __user tv = {
                        .tv_sec = ts.tv_sec,
                        .tv_usec = ts.tv_nsec,
                };
                if (copy_to_user(userstamp, &tv, sizeof(tv)))
                        return -EFAULT;
                return 0;
        }
#endif
        return put_timespec64(&ts, userstamp);
}
3492 EXPORT_SYMBOL(sock_gettstamp);
3493
3494 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3495 {
3496         if (!sock_flag(sk, flag)) {
3497                 unsigned long previous_flags = sk->sk_flags;
3498
3499                 sock_set_flag(sk, flag);
3500                 /*
3501                  * we just set one of the two flags which require net
3502                  * time stamping, but time stamping might have been on
3503                  * already because of the other one
3504                  */
3505                 if (sock_needs_netstamp(sk) &&
3506                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3507                         net_enable_timestamp();
3508         }
3509 }
3510
3511 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3512                        int level, int type)
3513 {
3514         struct sock_exterr_skb *serr;
3515         struct sk_buff *skb;
3516         int copied, err;
3517
3518         err = -EAGAIN;
3519         skb = sock_dequeue_err_skb(sk);
3520         if (skb == NULL)
3521                 goto out;
3522
3523         copied = skb->len;
3524         if (copied > len) {
3525                 msg->msg_flags |= MSG_TRUNC;
3526                 copied = len;
3527         }
3528         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3529         if (err)
3530                 goto out_free_skb;
3531
3532         sock_recv_timestamp(msg, sk, skb);
3533
3534         serr = SKB_EXT_ERR(skb);
3535         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3536
3537         msg->msg_flags |= MSG_ERRQUEUE;
3538         err = copied;
3539
3540 out_free_skb:
3541         kfree_skb(skb);
3542 out:
3543         return err;
3544 }
3545 EXPORT_SYMBOL(sock_recv_errqueue);
3546
3547 /*
3548  *      Get a socket option on an socket.
3549  *
3550  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3551  *      asynchronous errors should be reported by getsockopt. We assume
3552  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3553  */
3554 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3555                            char __user *optval, int __user *optlen)
3556 {
3557         struct sock *sk = sock->sk;
3558
3559         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3560 }
3561 EXPORT_SYMBOL(sock_common_getsockopt);
3562
3563 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3564                         int flags)
3565 {
3566         struct sock *sk = sock->sk;
3567         int addr_len = 0;
3568         int err;
3569
3570         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3571         if (err >= 0)
3572                 msg->msg_namelen = addr_len;
3573         return err;
3574 }
3575 EXPORT_SYMBOL(sock_common_recvmsg);
3576
3577 /*
3578  *      Set socket options on an inet socket.
3579  */
3580 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3581                            sockptr_t optval, unsigned int optlen)
3582 {
3583         struct sock *sk = sock->sk;
3584
3585         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3586 }
3587 EXPORT_SYMBOL(sock_common_setsockopt);
3588
/*
 * Common socket release path: run the protocol's optional destroy hook,
 * unhash the socket, orphan it, free its xfrm policy and drop one reference.
 */
void sk_common_release(struct sock *sk)
{
        /* destroy is optional; unhash below is called unconditionally. */
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes no
         * longer have access to the socket, but the network stack still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but some
         * packets may still be in flight because a CPU running the receiver
         * did its hash table lookup before we unhashed the socket. They will
         * reach the receive queue and will be purged by the socket
         * destructor.
         *
         * We also still have packets pending on the receive queue and,
         * probably, our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3625
3626 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3627 {
3628         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3629
3630         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3631         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3632         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3633         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3634         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3635         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3636         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3637         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3638         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3639 }
3640
3641 #ifdef CONFIG_PROC_FS
/*
 * Allocation bitmap for proto::inuse_idx values: assign_proto_idx() sets the
 * bit of the index it hands out and release_proto_idx() clears it again.
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3643
3644 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3645 {
3646         int cpu, idx = prot->inuse_idx;
3647         int res = 0;
3648
3649         for_each_possible_cpu(cpu)
3650                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3651
3652         return res >= 0 ? res : 0;
3653 }
3654 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3655
3656 int sock_inuse_get(struct net *net)
3657 {
3658         int cpu, res = 0;
3659
3660         for_each_possible_cpu(cpu)
3661                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3662
3663         return res;
3664 }
3665
3666 EXPORT_SYMBOL_GPL(sock_inuse_get);
3667
3668 static int __net_init sock_inuse_init_net(struct net *net)
3669 {
3670         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3671         if (net->core.prot_inuse == NULL)
3672                 return -ENOMEM;
3673         return 0;
3674 }
3675
/* Per-namespace exit: free the per-CPU counters allocated at init time. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}
3680
/* Per-namespace hooks that allocate and free net->core.prot_inuse. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
3685
3686 static __init int net_inuse_init(void)
3687 {
3688         if (register_pernet_subsys(&net_inuse_ops))
3689                 panic("Cannot initialize net inuse counters");
3690
3691         return 0;
3692 }
3693
3694 core_initcall(net_inuse_init);
3695
/*
 * Pick a free slot in proto_inuse_idx for @prot and mark it used.
 *
 * The last slot (PROTO_INUSE_NR - 1) is never handed out: finding it as the
 * first free bit is treated as "table exhausted". In that case
 * prot->inuse_idx is left at that value, which release_proto_idx()
 * recognises and skips.
 *
 * Returns 0 on success or -ENOSPC when no slot is available.
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return -ENOSPC;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
        return 0;
}
3708
3709 static void release_proto_idx(struct proto *prot)
3710 {
3711         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3712                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3713 }
3714 #else
/* !CONFIG_PROC_FS stub: no index is tracked, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}
3719
/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3723
3724 #endif
3725
3726 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3727 {
3728         if (!twsk_prot)
3729                 return;
3730         kfree(twsk_prot->twsk_slab_name);
3731         twsk_prot->twsk_slab_name = NULL;
3732         kmem_cache_destroy(twsk_prot->twsk_slab);
3733         twsk_prot->twsk_slab = NULL;
3734 }
3735
3736 static int tw_prot_init(const struct proto *prot)
3737 {
3738         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3739
3740         if (!twsk_prot)
3741                 return 0;
3742
3743         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3744                                               prot->name);
3745         if (!twsk_prot->twsk_slab_name)
3746                 return -ENOMEM;
3747
3748         twsk_prot->twsk_slab =
3749                 kmem_cache_create(twsk_prot->twsk_slab_name,
3750                                   twsk_prot->twsk_obj_size, 0,
3751                                   SLAB_ACCOUNT | prot->slab_flags,
3752                                   NULL);
3753         if (!twsk_prot->twsk_slab) {
3754                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3755                         prot->name);
3756                 return -ENOMEM;
3757         }
3758
3759         return 0;
3760 }
3761
3762 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3763 {
3764         if (!rsk_prot)
3765                 return;
3766         kfree(rsk_prot->slab_name);
3767         rsk_prot->slab_name = NULL;
3768         kmem_cache_destroy(rsk_prot->slab);
3769         rsk_prot->slab = NULL;
3770 }
3771
3772 static int req_prot_init(const struct proto *prot)
3773 {
3774         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3775
3776         if (!rsk_prot)
3777                 return 0;
3778
3779         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3780                                         prot->name);
3781         if (!rsk_prot->slab_name)
3782                 return -ENOMEM;
3783
3784         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3785                                            rsk_prot->obj_size, 0,
3786                                            SLAB_ACCOUNT | prot->slab_flags,
3787                                            NULL);
3788
3789         if (!rsk_prot->slab) {
3790                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3791                         prot->name);
3792                 return -ENOMEM;
3793         }
3794         return 0;
3795 }
3796
3797 int proto_register(struct proto *prot, int alloc_slab)
3798 {
3799         int ret = -ENOBUFS;
3800
3801         if (prot->memory_allocated && !prot->sysctl_mem) {
3802                 pr_err("%s: missing sysctl_mem\n", prot->name);
3803                 return -EINVAL;
3804         }
3805         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3806                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3807                 return -EINVAL;
3808         }
3809         if (alloc_slab) {
3810                 prot->slab = kmem_cache_create_usercopy(prot->name,
3811                                         prot->obj_size, 0,
3812                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3813                                         prot->slab_flags,
3814                                         prot->useroffset, prot->usersize,
3815                                         NULL);
3816
3817                 if (prot->slab == NULL) {
3818                         pr_crit("%s: Can't create sock SLAB cache!\n",
3819                                 prot->name);
3820                         goto out;
3821                 }
3822
3823                 if (req_prot_init(prot))
3824                         goto out_free_request_sock_slab;
3825
3826                 if (tw_prot_init(prot))
3827                         goto out_free_timewait_sock_slab;
3828         }
3829
3830         mutex_lock(&proto_list_mutex);
3831         ret = assign_proto_idx(prot);
3832         if (ret) {
3833                 mutex_unlock(&proto_list_mutex);
3834                 goto out_free_timewait_sock_slab;
3835         }
3836         list_add(&prot->node, &proto_list);
3837         mutex_unlock(&proto_list_mutex);
3838         return ret;
3839
3840 out_free_timewait_sock_slab:
3841         if (alloc_slab)
3842                 tw_prot_cleanup(prot->twsk_prot);
3843 out_free_request_sock_slab:
3844         if (alloc_slab) {
3845                 req_prot_cleanup(prot->rsk_prot);
3846
3847                 kmem_cache_destroy(prot->slab);
3848                 prot->slab = NULL;
3849         }
3850 out:
3851         return ret;
3852 }
3853 EXPORT_SYMBOL(proto_register);
3854
/*
 * Unregister a protocol: release its in-use index, remove it from
 * proto_list, and destroy its sock, request sock and timewait sock caches.
 */
void proto_unregister(struct proto *prot)
{
        /* Index release and list removal happen under proto_list_mutex. */
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        /* The caches are torn down after the mutex has been dropped. */
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
3869
3870 int sock_load_diag_module(int family, int protocol)
3871 {
3872         if (!protocol) {
3873                 if (!sock_is_registered(family))
3874                         return -ENOENT;
3875
3876                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3877                                       NETLINK_SOCK_DIAG, family);
3878         }
3879
3880 #ifdef CONFIG_INET
3881         if (family == AF_INET &&
3882             protocol != IPPROTO_RAW &&
3883             protocol < MAX_INET_PROTOS &&
3884             !rcu_access_pointer(inet_protos[protocol]))
3885                 return -ENOENT;
3886 #endif
3887
3888         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3889                               NETLINK_SOCK_DIAG, family, protocol);
3890 }
3891 EXPORT_SYMBOL(sock_load_diag_module);
3892
3893 #ifdef CONFIG_PROC_FS
/*
 * seq_file start hook: take proto_list_mutex and position the iterator in
 * proto_list at *pos. The mutex is released by proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}
3900
/* seq_file next hook: advance to the following proto_list entry. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}
3905
/* seq_file stop hook: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}
3911
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the protocols table. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
3916 static long sock_prot_memory_allocated(struct proto *proto)
3917 {
3918         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3919 }
3920
3921 static const char *sock_prot_memory_pressure(struct proto *proto)
3922 {
3923         return proto->memory_pressure != NULL ?
3924         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3925 }
3926
/*
 * Print one row of the protocols table for @proto: name, object size,
 * sockets in use, memory accounting, memory pressure, max header, whether a
 * slab cache exists, owning module, and one y/n column per protocol method
 * (19 columns, in the same order as the header printed by proto_seq_show()).
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
3960
3961 static int proto_seq_show(struct seq_file *seq, void *v)
3962 {
3963         if (v == &proto_list)
3964                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3965                            "protocol",
3966                            "size",
3967                            "sockets",
3968                            "memory",
3969                            "press",
3970                            "maxhdr",
3971                            "slab",
3972                            "module",
3973                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3974         else
3975                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3976         return 0;
3977 }
3978
/* seq_file iterator over proto_list, used for the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
3985
3986 static __net_init int proto_init_net(struct net *net)
3987 {
3988         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3989                         sizeof(struct seq_net_private)))
3990                 return -ENOMEM;
3991
3992         return 0;
3993 }
3994
/* Per-namespace exit: remove the "protocols" entry from net->proc_net. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}
3999
4000
/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
4005
/*
 * Register proto_net_ops at subsys_initcall time; returns the
 * register_pernet_subsys() result.
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
4012
4013 #endif /* PROC_FS */
4014
4015 #ifdef CONFIG_NET_RX_BUSY_POLL
4016 bool sk_busy_loop_end(void *p, unsigned long start_time)
4017 {
4018         struct sock *sk = p;
4019
4020         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4021                sk_busy_loop_timeout(sk, start_time);
4022 }
4023 EXPORT_SYMBOL(sk_busy_loop_end);
4024 #endif /* CONFIG_NET_RX_BUSY_POLL */
4025
4026 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4027 {
4028         if (!sk->sk_prot->bind_add)
4029                 return -EOPNOTSUPP;
4030         return sk->sk_prot->bind_add(sk, addr, addr_len);
4031 }
4032 EXPORT_SYMBOL(sock_bind_add);