net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <asm/unaligned.h>
  95 #include <linux/capability.h>
  96 #include <linux/errno.h>
  97 #include <linux/errqueue.h>
  98 #include <linux/types.h>
  99 #include <linux/socket.h>
 100 #include <linux/in.h>
 101 #include <linux/kernel.h>
 102 #include <linux/module.h>
 103 #include <linux/proc_fs.h>
 104 #include <linux/seq_file.h>
 105 #include <linux/sched.h>
 106 #include <linux/sched/mm.h>
 107 #include <linux/timer.h>
 108 #include <linux/string.h>
 109 #include <linux/sockios.h>
 110 #include <linux/net.h>
 111 #include <linux/mm.h>
 112 #include <linux/slab.h>
 113 #include <linux/interrupt.h>
 114 #include <linux/poll.h>
 115 #include <linux/tcp.h>
 116 #include <linux/init.h>
 117 #include <linux/highmem.h>
 118 #include <linux/user_namespace.h>
 119 #include <linux/static_key.h>
 120 #include <linux/memcontrol.h>
 121 #include <linux/prefetch.h>
 122
 123 #include <linux/uaccess.h>
 124
 125 #include <linux/netdevice.h>
 126 #include <net/protocol.h>
 127 #include <linux/skbuff.h>
 128 #include <net/net_namespace.h>
 129 #include <net/request_sock.h>
 130 #include <net/sock.h>
 131 #include <linux/net_tstamp.h>
 132 #include <net/xfrm.h>
 133 #include <linux/ipsec.h>
 134 #include <net/cls_cgroup.h>
 135 #include <net/netprio_cgroup.h>
 136 #include <linux/sock_diag.h>
 137
 138 #include <linux/filter.h>
 139 #include <net/sock_reuseport.h>
 140 #include <net/bpf_sk_storage.h>
 141
 142 #include <trace/events/sock.h>
 143
 144 #include <net/tcp.h>
 145 #include <net/busy_poll.h>
 146
 147 static DEFINE_MUTEX(proto_list_mutex);
 148 static LIST_HEAD(proto_list);
 149
 150 static void sock_inuse_add(struct net *net, int val);
 151
 152 /**
 153  * sk_ns_capable - General socket capability test
 154  * @sk: Socket to use a capability on or through
 155  * @user_ns: The user namespace of the capability to use
 156  * @cap: The capability to use
 157  *
 158  * Test to see if the opener of the socket had when the socket was
 159  * created and the current process has the capability @cap in the user
 160  * namespace @user_ns.
 161  */
 162 bool sk_ns_capable(const struct sock *sk,
 163                    struct user_namespace *user_ns, int cap)
 164 {
 165         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 166                 ns_capable(user_ns, cap);
 167 }
 168 EXPORT_SYMBOL(sk_ns_capable);
 169
 170 /**
 171  * sk_capable - Socket global capability test
 172  * @sk: Socket to use a capability on or through
 173  * @cap: The global capability to use
 174  *
 175  * Test to see if the opener of the socket had when the socket was
 176  * created and the current process has the capability @cap in all user
 177  * namespaces.
 178  */
 179 bool sk_capable(const struct sock *sk, int cap)
 180 {
 181         return sk_ns_capable(sk, &init_user_ns, cap);
 182 }
 183 EXPORT_SYMBOL(sk_capable);
 184
 185 /**
 186  * sk_net_capable - Network namespace socket capability test
 187  * @sk: Socket to use a capability on or through
 188  * @cap: The capability to use
 189  *
 190  * Test to see if the opener of the socket had when the socket was created
 191  * and the current process has the capability @cap over the network namespace
 192  * the socket is a member of.
 193  */
 194 bool sk_net_capable(const struct sock *sk, int cap)
 195 {
 196         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 197 }
 198 EXPORT_SYMBOL(sk_net_capable);
 199
 200 /*
 201  * Each address family might have different locking rules, so we have
 202  * one slock key per address family and separate keys for internal and
 203  * userspace sockets.
 204  */
 205 static struct lock_class_key af_family_keys[AF_MAX];
 206 static struct lock_class_key af_family_kern_keys[AF_MAX];
 207 static struct lock_class_key af_family_slock_keys[AF_MAX];
 208 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 209
 210 /*
 211  * Make lock validator output more readable. (we pre-construct these
 212  * strings build-time, so that runtime initialization of socket
 213  * locks is fast):
 214  */
 215
 216 #define _sock_locks(x)                                            \
 217   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 218   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 219   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 220   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 221   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 222   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 223   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 224   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 225   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 226   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 227   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 228   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 229   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 230   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 231   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 232   x "AF_MAX"
 233
 234 static const char *const af_family_key_strings[AF_MAX+1] = {
 235         _sock_locks("sk_lock-")
 236 };
 237 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 238         _sock_locks("slock-")
 239 };
 240 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 241         _sock_locks("clock-")
 242 };
 243
 244 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 245         _sock_locks("k-sk_lock-")
 246 };
 247 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 248         _sock_locks("k-slock-")
 249 };
 250 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 251         _sock_locks("k-clock-")
 252 };
 253 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 254         _sock_locks("rlock-")
 255 };
 256 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 257         _sock_locks("wlock-")
 258 };
 259 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 260         _sock_locks("elock-")
 261 };
 262
 263 /*
 264  * sk_callback_lock and sk queues locking rules are per-address-family,
 265  * so split the lock classes by using a per-AF key:
 266  */
 267 static struct lock_class_key af_callback_keys[AF_MAX];
 268 static struct lock_class_key af_rlock_keys[AF_MAX];
 269 static struct lock_class_key af_wlock_keys[AF_MAX];
 270 static struct lock_class_key af_elock_keys[AF_MAX];
 271 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 272
 273 /* Run time adjustable parameters. */
 274 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 275 EXPORT_SYMBOL(sysctl_wmem_max);
 276 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 277 EXPORT_SYMBOL(sysctl_rmem_max);
 278 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 279 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 280
 281 /* Maximal space eaten by iovec or ancillary data plus some space */
 282 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 283 EXPORT_SYMBOL(sysctl_optmem_max);
 284
 285 int sysctl_tstamp_allow_data __read_mostly = 1;
 286
 287 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 288 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 289
 290 /**
 291  * sk_set_memalloc - sets %SOCK_MEMALLOC
 292  * @sk: socket to set it on
 293  *
 294  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 295  * It's the responsibility of the admin to adjust min_free_kbytes
 296  * to meet the requirements
 297  */
 298 void sk_set_memalloc(struct sock *sk)
 299 {
 300         sock_set_flag(sk, SOCK_MEMALLOC);
 301         sk->sk_allocation |= __GFP_MEMALLOC;
 302         static_branch_inc(&memalloc_socks_key);
 303 }
 304 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 305
 306 void sk_clear_memalloc(struct sock *sk)
 307 {
 308         sock_reset_flag(sk, SOCK_MEMALLOC);
 309         sk->sk_allocation &= ~__GFP_MEMALLOC;
 310         static_branch_dec(&memalloc_socks_key);
 311
 312         /*
 313          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 314          * progress of swapping. SOCK_MEMALLOC may be cleared while
 315          * it has rmem allocations due to the last swapfile being deactivated
 316          * but there is a risk that the socket is unusable due to exceeding
 317          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 318          */
 319         sk_mem_reclaim(sk);
 320 }
 321 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 322
 323 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 324 {
 325         int ret;
 326         unsigned int noreclaim_flag;
 327
 328         /* these should have been dropped before queueing */
 329         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 330
 331         noreclaim_flag = memalloc_noreclaim_save();
 332         ret = sk->sk_backlog_rcv(sk, skb);
 333         memalloc_noreclaim_restore(noreclaim_flag);
 334
 335         return ret;
 336 }
 337 EXPORT_SYMBOL(__sk_backlog_rcv);
 338
 339 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 340 {
 341         struct __kernel_sock_timeval tv;
 342         int size;
 343
 344         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 345                 tv.tv_sec = 0;
 346                 tv.tv_usec = 0;
 347         } else {
 348                 tv.tv_sec = timeo / HZ;
 349                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 350         }
 351
 352         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 353                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 354                 *(struct old_timeval32 *)optval = tv32;
 355                 return sizeof(tv32);
 356         }
 357
 358         if (old_timeval) {
 359                 struct __kernel_old_timeval old_tv;
 360                 old_tv.tv_sec = tv.tv_sec;
 361                 old_tv.tv_usec = tv.tv_usec;
 362                 *(struct __kernel_old_timeval *)optval = old_tv;
 363                 size = sizeof(old_tv);
 364         } else {
 365                 *(struct __kernel_sock_timeval *)optval = tv;
 366                 size = sizeof(tv);
 367         }
 368
 369         return size;
 370 }
 371
 372 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 373 {
 374         struct __kernel_sock_timeval tv;
 375
 376         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 377                 struct old_timeval32 tv32;
 378
 379                 if (optlen < sizeof(tv32))
 380                         return -EINVAL;
 381
 382                 if (copy_from_user(&tv32, optval, sizeof(tv32)))
 383                         return -EFAULT;
 384                 tv.tv_sec = tv32.tv_sec;
 385                 tv.tv_usec = tv32.tv_usec;
 386         } else if (old_timeval) {
 387                 struct __kernel_old_timeval old_tv;
 388
 389                 if (optlen < sizeof(old_tv))
 390                         return -EINVAL;
 391                 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
 392                         return -EFAULT;
 393                 tv.tv_sec = old_tv.tv_sec;
 394                 tv.tv_usec = old_tv.tv_usec;
 395         } else {
 396                 if (optlen < sizeof(tv))
 397                         return -EINVAL;
 398                 if (copy_from_user(&tv, optval, sizeof(tv)))
 399                         return -EFAULT;
 400         }
 401         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 402                 return -EDOM;
 403
 404         if (tv.tv_sec < 0) {
 405                 static int warned __read_mostly;
 406
 407                 *timeo_p = 0;
 408                 if (warned < 10 && net_ratelimit()) {
 409                         warned++;
 410                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 411                                 __func__, current->comm, task_pid_nr(current));
 412                 }
 413                 return 0;
 414         }
 415         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 416         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 417                 return 0;
 418         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 419                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 420         return 0;
 421 }
 422
 423 static void sock_warn_obsolete_bsdism(const char *name)
 424 {
 425         static int warned;
 426         static char warncomm[TASK_COMM_LEN];
 427         if (strcmp(warncomm, current->comm) && warned < 5) {
 428                 strcpy(warncomm,  current->comm);
 429                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 430                         warncomm, name);
 431                 warned++;
 432         }
 433 }
 434
 435 static bool sock_needs_netstamp(const struct sock *sk)
 436 {
 437         switch (sk->sk_family) {
 438         case AF_UNSPEC:
 439         case AF_UNIX:
 440                 return false;
 441         default:
 442                 return true;
 443         }
 444 }
 445
 446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 447 {
 448         if (sk->sk_flags & flags) {
 449                 sk->sk_flags &= ~flags;
 450                 if (sock_needs_netstamp(sk) &&
 451                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 452                         net_disable_timestamp();
 453         }
 454 }
 455
 456
 457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 458 {
 459         unsigned long flags;
 460         struct sk_buff_head *list = &sk->sk_receive_queue;
 461
 462         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 463                 atomic_inc(&sk->sk_drops);
 464                 trace_sock_rcvqueue_full(sk, skb);
 465                 return -ENOMEM;
 466         }
 467
 468         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 469                 atomic_inc(&sk->sk_drops);
 470                 return -ENOBUFS;
 471         }
 472
 473         skb->dev = NULL;
 474         skb_set_owner_r(skb, sk);
 475
 476         /* we escape from rcu protected region, make sure we dont leak
 477          * a norefcounted dst
 478          */
 479         skb_dst_force(skb);
 480
 481         spin_lock_irqsave(&list->lock, flags);
 482         sock_skb_set_dropcount(sk, skb);
 483         __skb_queue_tail(list, skb);
 484         spin_unlock_irqrestore(&list->lock, flags);
 485
 486         if (!sock_flag(sk, SOCK_DEAD))
 487                 sk->sk_data_ready(sk);
 488         return 0;
 489 }
 490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 491
 492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 493 {
 494         int err;
 495
 496         err = sk_filter(sk, skb);
 497         if (err)
 498                 return err;
 499
 500         return __sock_queue_rcv_skb(sk, skb);
 501 }
 502 EXPORT_SYMBOL(sock_queue_rcv_skb);
 503
 504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 505                      const int nested, unsigned int trim_cap, bool refcounted)
 506 {
 507         int rc = NET_RX_SUCCESS;
 508
 509         if (sk_filter_trim_cap(sk, skb, trim_cap))
 510                 goto discard_and_relse;
 511
 512         skb->dev = NULL;
 513
 514         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 515                 atomic_inc(&sk->sk_drops);
 516                 goto discard_and_relse;
 517         }
 518         if (nested)
 519                 bh_lock_sock_nested(sk);
 520         else
 521                 bh_lock_sock(sk);
 522         if (!sock_owned_by_user(sk)) {
 523                 /*
 524                  * trylock + unlock semantics:
 525                  */
 526                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 527
 528                 rc = sk_backlog_rcv(sk, skb);
 529
 530                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 531         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 532                 bh_unlock_sock(sk);
 533                 atomic_inc(&sk->sk_drops);
 534                 goto discard_and_relse;
 535         }
 536
 537         bh_unlock_sock(sk);
 538 out:
 539         if (refcounted)
 540                 sock_put(sk);
 541         return rc;
 542 discard_and_relse:
 543         kfree_skb(skb);
 544         goto out;
 545 }
 546 EXPORT_SYMBOL(__sk_receive_skb);
 547
 548 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 549 {
 550         struct dst_entry *dst = __sk_dst_get(sk);
 551
 552         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 553                 sk_tx_queue_clear(sk);
 554                 sk->sk_dst_pending_confirm = 0;
 555                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 556                 dst_release(dst);
 557                 return NULL;
 558         }
 559
 560         return dst;
 561 }
 562 EXPORT_SYMBOL(__sk_dst_check);
 563
 564 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 565 {
 566         struct dst_entry *dst = sk_dst_get(sk);
 567
 568         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 569                 sk_dst_reset(sk);
 570                 dst_release(dst);
 571                 return NULL;
 572         }
 573
 574         return dst;
 575 }
 576 EXPORT_SYMBOL(sk_dst_check);
 577
 578 static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 579 {
 580         int ret = -ENOPROTOOPT;
 581 #ifdef CONFIG_NETDEVICES
 582         struct net *net = sock_net(sk);
 583
 584         /* Sorry... */
 585         ret = -EPERM;
 586         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 587                 goto out;
 588
 589         ret = -EINVAL;
 590         if (ifindex < 0)
 591                 goto out;
 592
 593         sk->sk_bound_dev_if = ifindex;
 594         if (sk->sk_prot->rehash)
 595                 sk->sk_prot->rehash(sk);
 596         sk_dst_reset(sk);
 597
 598         ret = 0;
 599
 600 out:
 601 #endif
 602
 603         return ret;
 604 }
 605
 606 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 607                                 int optlen)
 608 {
 609         int ret = -ENOPROTOOPT;
 610 #ifdef CONFIG_NETDEVICES
 611         struct net *net = sock_net(sk);
 612         char devname[IFNAMSIZ];
 613         int index;
 614
 615         ret = -EINVAL;
 616         if (optlen < 0)
 617                 goto out;
 618
 619         /* Bind this socket to a particular device like "eth0",
 620          * as specified in the passed interface name. If the
 621          * name is "" or the option length is zero the socket
 622          * is not bound.
 623          */
 624         if (optlen > IFNAMSIZ - 1)
 625                 optlen = IFNAMSIZ - 1;
 626         memset(devname, 0, sizeof(devname));
 627
 628         ret = -EFAULT;
 629         if (copy_from_user(devname, optval, optlen))
 630                 goto out;
 631
 632         index = 0;
 633         if (devname[0] != '\0') {
 634                 struct net_device *dev;
 635
 636                 rcu_read_lock();
 637                 dev = dev_get_by_name_rcu(net, devname);
 638                 if (dev)
 639                         index = dev->ifindex;
 640                 rcu_read_unlock();
 641                 ret = -ENODEV;
 642                 if (!dev)
 643                         goto out;
 644         }
 645
 646         lock_sock(sk);
 647         ret = sock_setbindtodevice_locked(sk, index);
 648         release_sock(sk);
 649
 650 out:
 651 #endif
 652
 653         return ret;
 654 }
 655
 656 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 657                                 int __user *optlen, int len)
 658 {
 659         int ret = -ENOPROTOOPT;
 660 #ifdef CONFIG_NETDEVICES
 661         struct net *net = sock_net(sk);
 662         char devname[IFNAMSIZ];
 663
 664         if (sk->sk_bound_dev_if == 0) {
 665                 len = 0;
 666                 goto zero;
 667         }
 668
 669         ret = -EINVAL;
 670         if (len < IFNAMSIZ)
 671                 goto out;
 672
 673         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 674         if (ret)
 675                 goto out;
 676
 677         len = strlen(devname) + 1;
 678
 679         ret = -EFAULT;
 680         if (copy_to_user(optval, devname, len))
 681                 goto out;
 682
 683 zero:
 684         ret = -EFAULT;
 685         if (put_user(len, optlen))
 686                 goto out;
 687
 688         ret = 0;
 689
 690 out:
 691 #endif
 692
 693         return ret;
 694 }
 695
 696 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 697 {
 698         if (valbool)
 699                 sock_set_flag(sk, bit);
 700         else
 701                 sock_reset_flag(sk, bit);
 702 }
 703
 704 bool sk_mc_loop(struct sock *sk)
 705 {
 706         if (dev_recursion_level())
 707                 return false;
 708         if (!sk)
 709                 return true;
 710         switch (sk->sk_family) {
 711         case AF_INET:
 712                 return inet_sk(sk)->mc_loop;
 713 #if IS_ENABLED(CONFIG_IPV6)
 714         case AF_INET6:
 715                 return inet6_sk(sk)->mc_loop;
 716 #endif
 717         }
 718         WARN_ON(1);
 719         return true;
 720 }
 721 EXPORT_SYMBOL(sk_mc_loop);
 722
 723 /*
 724  *      This is meant for all protocols to use and covers goings on
 725  *      at the socket level. Everything here is generic.
 726  */
 727
 728 int sock_setsockopt(struct socket *sock, int level, int optname,
 729                     char __user *optval, unsigned int optlen)
 730 {
 731         struct sock_txtime sk_txtime;
 732         struct sock *sk = sock->sk;
 733         int val;
 734         int valbool;
 735         struct linger ling;
 736         int ret = 0;
 737
 738         /*
 739          *      Options without arguments
 740          */
 741
 742         if (optname == SO_BINDTODEVICE)
 743                 return sock_setbindtodevice(sk, optval, optlen);
 744
 745         if (optlen < sizeof(int))
 746                 return -EINVAL;
 747
 748         if (get_user(val, (int __user *)optval))
 749                 return -EFAULT;
 750
 751         valbool = val ? 1 : 0;
 752
 753         lock_sock(sk);
 754
 755         switch (optname) {
 756         case SO_DEBUG:
 757                 if (val && !capable(CAP_NET_ADMIN))
 758                         ret = -EACCES;
 759                 else
 760                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 761                 break;
 762         case SO_REUSEADDR:
 763                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 764                 break;
 765         case SO_REUSEPORT:
 766                 sk->sk_reuseport = valbool;
 767                 break;
 768         case SO_TYPE:
 769         case SO_PROTOCOL:
 770         case SO_DOMAIN:
 771         case SO_ERROR:
 772                 ret = -ENOPROTOOPT;
 773                 break;
 774         case SO_DONTROUTE:
 775                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 776                 sk_dst_reset(sk);
 777                 break;
 778         case SO_BROADCAST:
 779                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 780                 break;
 781         case SO_SNDBUF:
 782                 /* Don't error on this BSD doesn't and if you think
 783                  * about it this is right. Otherwise apps have to
 784                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 785                  * are treated in BSD as hints
 786                  */
 787                 val = min_t(u32, val, sysctl_wmem_max);
 788 set_sndbuf:
 789                 /* Ensure val * 2 fits into an int, to prevent max_t()
 790                  * from treating it as a negative value.
 791                  */
 792                 val = min_t(int, val, INT_MAX / 2);
 793                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 794                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 795                 /* Wake up sending tasks if we upped the value. */
 796                 sk->sk_write_space(sk);
 797                 break;
 798
 799         case SO_SNDBUFFORCE:
 800                 if (!capable(CAP_NET_ADMIN)) {
 801                         ret = -EPERM;
 802                         break;
 803                 }
 804
 805                 /* No negative values (to prevent underflow, as val will be
 806                  * multiplied by 2).
 807                  */
 808                 if (val < 0)
 809                         val = 0;
 810                 goto set_sndbuf;
 811
 812         case SO_RCVBUF:
 813                 /* Don't error on this BSD doesn't and if you think
 814                  * about it this is right. Otherwise apps have to
 815                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 816                  * are treated in BSD as hints
 817                  */
 818                 val = min_t(u32, val, sysctl_rmem_max);
 819 set_rcvbuf:
 820                 /* Ensure val * 2 fits into an int, to prevent max_t()
 821                  * from treating it as a negative value.
 822                  */
 823                 val = min_t(int, val, INT_MAX / 2);
 824                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 825                 /*
 826                  * We double it on the way in to account for
 827                  * "struct sk_buff" etc. overhead.   Applications
 828                  * assume that the SO_RCVBUF setting they make will
 829                  * allow that much actual data to be received on that
 830                  * socket.
 831                  *
 832                  * Applications are unaware that "struct sk_buff" and
 833                  * other overheads allocate from the receive buffer
 834                  * during socket buffer allocation.
 835                  *
 836                  * And after considering the possible alternatives,
 837                  * returning the value we actually used in getsockopt
 838                  * is the most desirable behavior.
 839                  */
 840                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 841                 break;
 842
 843         case SO_RCVBUFFORCE:
 844                 if (!capable(CAP_NET_ADMIN)) {
 845                         ret = -EPERM;
 846                         break;
 847                 }
 848
 849                 /* No negative values (to prevent underflow, as val will be
 850                  * multiplied by 2).
 851                  */
 852                 if (val < 0)
 853                         val = 0;
 854                 goto set_rcvbuf;
 855
 856         case SO_KEEPALIVE:
 857                 if (sk->sk_prot->keepalive)
 858                         sk->sk_prot->keepalive(sk, valbool);
 859                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 860                 break;
 861
 862         case SO_OOBINLINE:
 863                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 864                 break;
 865
 866         case SO_NO_CHECK:
 867                 sk->sk_no_check_tx = valbool;
 868                 break;
 869
 870         case SO_PRIORITY:
 871                 if ((val >= 0 && val <= 6) ||
 872                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 873                         sk->sk_priority = val;
 874                 else
 875                         ret = -EPERM;
 876                 break;
 877
 878         case SO_LINGER:
 879                 if (optlen < sizeof(ling)) {
 880                         ret = -EINVAL;  /* 1003.1g */
 881                         break;
 882                 }
 883                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 884                         ret = -EFAULT;
 885                         break;
 886                 }
 887                 if (!ling.l_onoff)
 888                         sock_reset_flag(sk, SOCK_LINGER);
 889                 else {
 890 #if (BITS_PER_LONG == 32)
 891                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 892                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 893                         else
 894 #endif
 895                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 896                         sock_set_flag(sk, SOCK_LINGER);
 897                 }
 898                 break;
 899
 900         case SO_BSDCOMPAT:
 901                 sock_warn_obsolete_bsdism("setsockopt");
 902                 break;
 903
 904         case SO_PASSCRED:
 905                 if (valbool)
 906                         set_bit(SOCK_PASSCRED, &sock->flags);
 907                 else
 908                         clear_bit(SOCK_PASSCRED, &sock->flags);
 909                 break;
 910
 911         case SO_TIMESTAMP_OLD:
 912         case SO_TIMESTAMP_NEW:
 913         case SO_TIMESTAMPNS_OLD:
 914         case SO_TIMESTAMPNS_NEW:
 915                 if (valbool)  {
 916                         if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
 917                                 sock_set_flag(sk, SOCK_TSTAMP_NEW);
 918                         else
 919                                 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 920
 921                         if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 922                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 923                         else
 924                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 925                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 926                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 927                 } else {
 928                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 929                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 930                         sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 931                 }
 932                 break;
 933
 934         case SO_TIMESTAMPING_NEW:
 935                 sock_set_flag(sk, SOCK_TSTAMP_NEW);
 936                 /* fall through */
 937         case SO_TIMESTAMPING_OLD:
 938                 if (val & ~SOF_TIMESTAMPING_MASK) {
 939                         ret = -EINVAL;
 940                         break;
 941                 }
 942
 943                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 944                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 945                         if (sk->sk_protocol == IPPROTO_TCP &&
 946                             sk->sk_type == SOCK_STREAM) {
 947                                 if ((1 << sk->sk_state) &
 948                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 949                                         ret = -EINVAL;
 950                                         break;
 951                                 }
 952                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 953                         } else {
 954                                 sk->sk_tskey = 0;
 955                         }
 956                 }
 957
 958                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 959                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 960                         ret = -EINVAL;
 961                         break;
 962                 }
 963
 964                 sk->sk_tsflags = val;
 965                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 966                         sock_enable_timestamp(sk,
 967                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 968                 else {
 969                         if (optname == SO_TIMESTAMPING_NEW)
 970                                 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 971
 972                         sock_disable_timestamp(sk,
 973                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 974                 }
 975                 break;
 976
 977         case SO_RCVLOWAT:
 978                 if (val < 0)
 979                         val = INT_MAX;
 980                 if (sock->ops->set_rcvlowat)
 981                         ret = sock->ops->set_rcvlowat(sk, val);
 982                 else
 983                         sk->sk_rcvlowat = val ? : 1;
 984                 break;
 985
 986         case SO_RCVTIMEO_OLD:
 987         case SO_RCVTIMEO_NEW:
 988                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 989                 break;
 990
 991         case SO_SNDTIMEO_OLD:
 992         case SO_SNDTIMEO_NEW:
 993                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 994                 break;
 995
 996         case SO_ATTACH_FILTER:
 997                 ret = -EINVAL;
 998                 if (optlen == sizeof(struct sock_fprog)) {
 999                         struct sock_fprog fprog;
1000
1001                         ret = -EFAULT;
1002                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
1003                                 break;
1004
1005                         ret = sk_attach_filter(&fprog, sk);
1006                 }
1007                 break;
1008
1009         case SO_ATTACH_BPF:
1010                 ret = -EINVAL;
1011                 if (optlen == sizeof(u32)) {
1012                         u32 ufd;
1013
1014                         ret = -EFAULT;
1015                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
1016                                 break;
1017
1018                         ret = sk_attach_bpf(ufd, sk);
1019                 }
1020                 break;
1021
1022         case SO_ATTACH_REUSEPORT_CBPF:
1023                 ret = -EINVAL;
1024                 if (optlen == sizeof(struct sock_fprog)) {
1025                         struct sock_fprog fprog;
1026
1027                         ret = -EFAULT;
1028                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
1029                                 break;
1030
1031                         ret = sk_reuseport_attach_filter(&fprog, sk);
1032                 }
1033                 break;
1034
1035         case SO_ATTACH_REUSEPORT_EBPF:
1036                 ret = -EINVAL;
1037                 if (optlen == sizeof(u32)) {
1038                         u32 ufd;
1039
1040                         ret = -EFAULT;
1041                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
1042                                 break;
1043
1044                         ret = sk_reuseport_attach_bpf(ufd, sk);
1045                 }
1046                 break;
1047
1048         case SO_DETACH_FILTER:
1049                 ret = sk_detach_filter(sk);
1050                 break;
1051
1052         case SO_LOCK_FILTER:
1053                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1054                         ret = -EPERM;
1055                 else
1056                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1057                 break;
1058
1059         case SO_PASSSEC:
1060                 if (valbool)
1061                         set_bit(SOCK_PASSSEC, &sock->flags);
1062                 else
1063                         clear_bit(SOCK_PASSSEC, &sock->flags);
1064                 break;
1065         case SO_MARK:
1066                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1067                         ret = -EPERM;
1068                 } else if (val != sk->sk_mark) {
1069                         sk->sk_mark = val;
1070                         sk_dst_reset(sk);
1071                 }
1072                 break;
1073
1074         case SO_RXQ_OVFL:
1075                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1076                 break;
1077
1078         case SO_WIFI_STATUS:
1079                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1080                 break;
1081
1082         case SO_PEEK_OFF:
1083                 if (sock->ops->set_peek_off)
1084                         ret = sock->ops->set_peek_off(sk, val);
1085                 else
1086                         ret = -EOPNOTSUPP;
1087                 break;
1088
1089         case SO_NOFCS:
1090                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1091                 break;
1092
1093         case SO_SELECT_ERR_QUEUE:
1094                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1095                 break;
1096
1097 #ifdef CONFIG_NET_RX_BUSY_POLL
1098         case SO_BUSY_POLL:
1099                 /* allow unprivileged users to decrease the value */
1100                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1101                         ret = -EPERM;
1102                 else {
1103                         if (val < 0)
1104                                 ret = -EINVAL;
1105                         else
1106                                 sk->sk_ll_usec = val;
1107                 }
1108                 break;
1109 #endif
1110
1111         case SO_MAX_PACING_RATE:
1112                 {
1113                 unsigned long ulval = (val == ~0U) ? ~0UL : val;
1114
1115                 if (sizeof(ulval) != sizeof(val) &&
1116                     optlen >= sizeof(ulval) &&
1117                     get_user(ulval, (unsigned long __user *)optval)) {
1118                         ret = -EFAULT;
1119                         break;
1120                 }
1121                 if (ulval != ~0UL)
1122                         cmpxchg(&sk->sk_pacing_status,
1123                                 SK_PACING_NONE,
1124                                 SK_PACING_NEEDED);
1125                 sk->sk_max_pacing_rate = ulval;
1126                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1127                 break;
1128                 }
1129         case SO_INCOMING_CPU:
1130                 sk->sk_incoming_cpu = val;
1131                 break;
1132
1133         case SO_CNX_ADVICE:
1134                 if (val == 1)
1135                         dst_negative_advice(sk);
1136                 break;
1137
1138         case SO_ZEROCOPY:
1139                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1140                         if (!((sk->sk_type == SOCK_STREAM &&
1141                                sk->sk_protocol == IPPROTO_TCP) ||
1142                               (sk->sk_type == SOCK_DGRAM &&
1143                                sk->sk_protocol == IPPROTO_UDP)))
1144                                 ret = -ENOTSUPP;
1145                 } else if (sk->sk_family != PF_RDS) {
1146                         ret = -ENOTSUPP;
1147                 }
1148                 if (!ret) {
1149                         if (val < 0 || val > 1)
1150                                 ret = -EINVAL;
1151                         else
1152                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1153                 }
1154                 break;
1155
1156         case SO_TXTIME:
1157                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1158                         ret = -EPERM;
1159                 } else if (optlen != sizeof(struct sock_txtime)) {
1160                         ret = -EINVAL;
1161                 } else if (copy_from_user(&sk_txtime, optval,
1162                            sizeof(struct sock_txtime))) {
1163                         ret = -EFAULT;
1164                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1165                         ret = -EINVAL;
1166                 } else {
1167                         sock_valbool_flag(sk, SOCK_TXTIME, true);
1168                         sk->sk_clockid = sk_txtime.clockid;
1169                         sk->sk_txtime_deadline_mode =
1170                                 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1171                         sk->sk_txtime_report_errors =
1172                                 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1173                 }
1174                 break;
1175
1176         case SO_BINDTOIFINDEX:
1177                 ret = sock_setbindtodevice_locked(sk, val);
1178                 break;
1179
1180         default:
1181                 ret = -ENOPROTOOPT;
1182                 break;
1183         }
1184         release_sock(sk);
1185         return ret;
1186 }
1187 EXPORT_SYMBOL(sock_setsockopt);
1188
1189
1190 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1191                           struct ucred *ucred)
1192 {
1193         ucred->pid = pid_vnr(pid);
1194         ucred->uid = ucred->gid = -1;
1195         if (cred) {
1196                 struct user_namespace *current_ns = current_user_ns();
1197
1198                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1199                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1200         }
1201 }
1202
1203 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1204 {
1205         struct user_namespace *user_ns = current_user_ns();
1206         int i;
1207
1208         for (i = 0; i < src->ngroups; i++)
1209                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1210                         return -EFAULT;
1211
1212         return 0;
1213 }
1214
1215 int sock_getsockopt(struct socket *sock, int level, int optname,
1216                     char __user *optval, int __user *optlen)
1217 {
1218         struct sock *sk = sock->sk;
1219
1220         union {
1221                 int val;
1222                 u64 val64;
1223                 unsigned long ulval;
1224                 struct linger ling;
1225                 struct old_timeval32 tm32;
1226                 struct __kernel_old_timeval tm;
1227                 struct  __kernel_sock_timeval stm;
1228                 struct sock_txtime txtime;
1229         } v;
1230
1231         int lv = sizeof(int);
1232         int len;
1233
1234         if (get_user(len, optlen))
1235                 return -EFAULT;
1236         if (len < 0)
1237                 return -EINVAL;
1238
1239         memset(&v, 0, sizeof(v));
1240
1241         switch (optname) {
1242         case SO_DEBUG:
1243                 v.val = sock_flag(sk, SOCK_DBG);
1244                 break;
1245
1246         case SO_DONTROUTE:
1247                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1248                 break;
1249
1250         case SO_BROADCAST:
1251                 v.val = sock_flag(sk, SOCK_BROADCAST);
1252                 break;
1253
1254         case SO_SNDBUF:
1255                 v.val = sk->sk_sndbuf;
1256                 break;
1257
1258         case SO_RCVBUF:
1259                 v.val = sk->sk_rcvbuf;
1260                 break;
1261
1262         case SO_REUSEADDR:
1263                 v.val = sk->sk_reuse;
1264                 break;
1265
1266         case SO_REUSEPORT:
1267                 v.val = sk->sk_reuseport;
1268                 break;
1269
1270         case SO_KEEPALIVE:
1271                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1272                 break;
1273
1274         case SO_TYPE:
1275                 v.val = sk->sk_type;
1276                 break;
1277
1278         case SO_PROTOCOL:
1279                 v.val = sk->sk_protocol;
1280                 break;
1281
1282         case SO_DOMAIN:
1283                 v.val = sk->sk_family;
1284                 break;
1285
1286         case SO_ERROR:
1287                 v.val = -sock_error(sk);
1288                 if (v.val == 0)
1289                         v.val = xchg(&sk->sk_err_soft, 0);
1290                 break;
1291
1292         case SO_OOBINLINE:
1293                 v.val = sock_flag(sk, SOCK_URGINLINE);
1294                 break;
1295
1296         case SO_NO_CHECK:
1297                 v.val = sk->sk_no_check_tx;
1298                 break;
1299
1300         case SO_PRIORITY:
1301                 v.val = sk->sk_priority;
1302                 break;
1303
1304         case SO_LINGER:
1305                 lv              = sizeof(v.ling);
1306                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1307                 v.ling.l_linger = sk->sk_lingertime / HZ;
1308                 break;
1309
1310         case SO_BSDCOMPAT:
1311                 sock_warn_obsolete_bsdism("getsockopt");
1312                 break;
1313
1314         case SO_TIMESTAMP_OLD:
1315                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1316                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1317                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1318                 break;
1319
1320         case SO_TIMESTAMPNS_OLD:
1321                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1322                 break;
1323
1324         case SO_TIMESTAMP_NEW:
1325                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1326                 break;
1327
1328         case SO_TIMESTAMPNS_NEW:
1329                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1330                 break;
1331
1332         case SO_TIMESTAMPING_OLD:
1333                 v.val = sk->sk_tsflags;
1334                 break;
1335
1336         case SO_RCVTIMEO_OLD:
1337         case SO_RCVTIMEO_NEW:
1338                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1339                 break;
1340
1341         case SO_SNDTIMEO_OLD:
1342         case SO_SNDTIMEO_NEW:
1343                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1344                 break;
1345
1346         case SO_RCVLOWAT:
1347                 v.val = sk->sk_rcvlowat;
1348                 break;
1349
1350         case SO_SNDLOWAT:
1351                 v.val = 1;
1352                 break;
1353
1354         case SO_PASSCRED:
1355                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1356                 break;
1357
1358         case SO_PEERCRED:
1359         {
1360                 struct ucred peercred;
1361                 if (len > sizeof(peercred))
1362                         len = sizeof(peercred);
1363                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1364                 if (copy_to_user(optval, &peercred, len))
1365                         return -EFAULT;
1366                 goto lenout;
1367         }
1368
1369         case SO_PEERGROUPS:
1370         {
1371                 int ret, n;
1372
1373                 if (!sk->sk_peer_cred)
1374                         return -ENODATA;
1375
1376                 n = sk->sk_peer_cred->group_info->ngroups;
1377                 if (len < n * sizeof(gid_t)) {
1378                         len = n * sizeof(gid_t);
1379                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1380                 }
1381                 len = n * sizeof(gid_t);
1382
1383                 ret = groups_to_user((gid_t __user *)optval,
1384                                      sk->sk_peer_cred->group_info);
1385                 if (ret)
1386                         return ret;
1387                 goto lenout;
1388         }
1389
1390         case SO_PEERNAME:
1391         {
1392                 char address[128];
1393
1394                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1395                 if (lv < 0)
1396                         return -ENOTCONN;
1397                 if (lv < len)
1398                         return -EINVAL;
1399                 if (copy_to_user(optval, address, len))
1400                         return -EFAULT;
1401                 goto lenout;
1402         }
1403
1404         /* Dubious BSD thing... Probably nobody even uses it, but
1405          * the UNIX standard wants it for whatever reason... -DaveM
1406          */
1407         case SO_ACCEPTCONN:
1408                 v.val = sk->sk_state == TCP_LISTEN;
1409                 break;
1410
1411         case SO_PASSSEC:
1412                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1413                 break;
1414
1415         case SO_PEERSEC:
1416                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1417
1418         case SO_MARK:
1419                 v.val = sk->sk_mark;
1420                 break;
1421
1422         case SO_RXQ_OVFL:
1423                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1424                 break;
1425
1426         case SO_WIFI_STATUS:
1427                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1428                 break;
1429
1430         case SO_PEEK_OFF:
1431                 if (!sock->ops->set_peek_off)
1432                         return -EOPNOTSUPP;
1433
1434                 v.val = sk->sk_peek_off;
1435                 break;
1436         case SO_NOFCS:
1437                 v.val = sock_flag(sk, SOCK_NOFCS);
1438                 break;
1439
1440         case SO_BINDTODEVICE:
1441                 return sock_getbindtodevice(sk, optval, optlen, len);
1442
1443         case SO_GET_FILTER:
1444                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1445                 if (len < 0)
1446                         return len;
1447
1448                 goto lenout;
1449
1450         case SO_LOCK_FILTER:
1451                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1452                 break;
1453
1454         case SO_BPF_EXTENSIONS:
1455                 v.val = bpf_tell_extensions();
1456                 break;
1457
1458         case SO_SELECT_ERR_QUEUE:
1459                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1460                 break;
1461
1462 #ifdef CONFIG_NET_RX_BUSY_POLL
1463         case SO_BUSY_POLL:
1464                 v.val = sk->sk_ll_usec;
1465                 break;
1466 #endif
1467
1468         case SO_MAX_PACING_RATE:
1469                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1470                         lv = sizeof(v.ulval);
1471                         v.ulval = sk->sk_max_pacing_rate;
1472                 } else {
1473                         /* 32bit version */
1474                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1475                 }
1476                 break;
1477
1478         case SO_INCOMING_CPU:
1479                 v.val = sk->sk_incoming_cpu;
1480                 break;
1481
1482         case SO_MEMINFO:
1483         {
1484                 u32 meminfo[SK_MEMINFO_VARS];
1485
1486                 if (get_user(len, optlen))
1487                         return -EFAULT;
1488
1489                 sk_get_meminfo(sk, meminfo);
1490
1491                 len = min_t(unsigned int, len, sizeof(meminfo));
1492                 if (copy_to_user(optval, &meminfo, len))
1493                         return -EFAULT;
1494
1495                 goto lenout;
1496         }
1497
1498 #ifdef CONFIG_NET_RX_BUSY_POLL
1499         case SO_INCOMING_NAPI_ID:
1500                 v.val = READ_ONCE(sk->sk_napi_id);
1501
1502                 /* aggregate non-NAPI IDs down to 0 */
1503                 if (v.val < MIN_NAPI_ID)
1504                         v.val = 0;
1505
1506                 break;
1507 #endif
1508
1509         case SO_COOKIE:
1510                 lv = sizeof(u64);
1511                 if (len < lv)
1512                         return -EINVAL;
1513                 v.val64 = sock_gen_cookie(sk);
1514                 break;
1515
1516         case SO_ZEROCOPY:
1517                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1518                 break;
1519
1520         case SO_TXTIME:
1521                 lv = sizeof(v.txtime);
1522                 v.txtime.clockid = sk->sk_clockid;
1523                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1524                                   SOF_TXTIME_DEADLINE_MODE : 0;
1525                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1526                                   SOF_TXTIME_REPORT_ERRORS : 0;
1527                 break;
1528
1529         case SO_BINDTOIFINDEX:
1530                 v.val = sk->sk_bound_dev_if;
1531                 break;
1532
1533         default:
1534                 /* We implement the SO_SNDLOWAT etc to not be settable
1535                  * (1003.1g 7).
1536                  */
1537                 return -ENOPROTOOPT;
1538         }
1539
1540         if (len > lv)
1541                 len = lv;
1542         if (copy_to_user(optval, &v, len))
1543                 return -EFAULT;
1544 lenout:
1545         if (put_user(len, optlen))
1546                 return -EFAULT;
1547         return 0;
1548 }
1549
1550 /*
1551  * Initialize an sk_lock.
1552  *
1553  * (We also register the sk_lock with the lock validator.)
1554  */
1555 static inline void sock_lock_init(struct sock *sk)
1556 {
1557         if (sk->sk_kern_sock)
1558                 sock_lock_init_class_and_name(
1559                         sk,
1560                         af_family_kern_slock_key_strings[sk->sk_family],
1561                         af_family_kern_slock_keys + sk->sk_family,
1562                         af_family_kern_key_strings[sk->sk_family],
1563                         af_family_kern_keys + sk->sk_family);
1564         else
1565                 sock_lock_init_class_and_name(
1566                         sk,
1567                         af_family_slock_key_strings[sk->sk_family],
1568                         af_family_slock_keys + sk->sk_family,
1569                         af_family_key_strings[sk->sk_family],
1570                         af_family_keys + sk->sk_family);
1571 }
1572
1573 /*
1574  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1575  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1576  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1577  */
1578 static void sock_copy(struct sock *nsk, const struct sock *osk)
1579 {
1580 #ifdef CONFIG_SECURITY_NETWORK
1581         void *sptr = nsk->sk_security;
1582 #endif
1583         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1584
1585         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1586                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1587
1588 #ifdef CONFIG_SECURITY_NETWORK
1589         nsk->sk_security = sptr;
1590         security_sk_clone(osk, nsk);
1591 #endif
1592 }
1593
1594 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1595                 int family)
1596 {
1597         struct sock *sk;
1598         struct kmem_cache *slab;
1599
1600         slab = prot->slab;
1601         if (slab != NULL) {
1602                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1603                 if (!sk)
1604                         return sk;
1605                 if (priority & __GFP_ZERO)
1606                         sk_prot_clear_nulls(sk, prot->obj_size);
1607         } else
1608                 sk = kmalloc(prot->obj_size, priority);
1609
1610         if (sk != NULL) {
1611                 if (security_sk_alloc(sk, family, priority))
1612                         goto out_free;
1613
1614                 if (!try_module_get(prot->owner))
1615                         goto out_free_sec;
1616                 sk_tx_queue_clear(sk);
1617         }
1618
1619         return sk;
1620
1621 out_free_sec:
1622         security_sk_free(sk);
1623 out_free:
1624         if (slab != NULL)
1625                 kmem_cache_free(slab, sk);
1626         else
1627                 kfree(sk);
1628         return NULL;
1629 }
1630
1631 static void sk_prot_free(struct proto *prot, struct sock *sk)
1632 {
1633         struct kmem_cache *slab;
1634         struct module *owner;
1635
1636         owner = prot->owner;
1637         slab = prot->slab;
1638
1639         cgroup_sk_free(&sk->sk_cgrp_data);
1640         mem_cgroup_sk_free(sk);
1641         security_sk_free(sk);
1642         if (slab != NULL)
1643                 kmem_cache_free(slab, sk);
1644         else
1645                 kfree(sk);
1646         module_put(owner);
1647 }
1648
1649 /**
1650  *      sk_alloc - All socket objects are allocated here
1651  *      @net: the applicable net namespace
1652  *      @family: protocol family
1653  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1654  *      @prot: struct proto associated with this new sock instance
1655  *      @kern: is this to be a kernel socket?
1656  */
1657 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1658                       struct proto *prot, int kern)
1659 {
1660         struct sock *sk;
1661
1662         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1663         if (sk) {
1664                 sk->sk_family = family;
1665                 /*
1666                  * See comment in struct sock definition to understand
1667                  * why we need sk_prot_creator -acme
1668                  */
1669                 sk->sk_prot = sk->sk_prot_creator = prot;
1670                 sk->sk_kern_sock = kern;
1671                 sock_lock_init(sk);
1672                 sk->sk_net_refcnt = kern ? 0 : 1;
1673                 if (likely(sk->sk_net_refcnt)) {
1674                         get_net(net);
1675                         sock_inuse_add(net, 1);
1676                 }
1677
1678                 sock_net_set(sk, net);
1679                 refcount_set(&sk->sk_wmem_alloc, 1);
1680
1681                 mem_cgroup_sk_alloc(sk);
1682                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1683                 sock_update_classid(&sk->sk_cgrp_data);
1684                 sock_update_netprioidx(&sk->sk_cgrp_data);
1685         }
1686
1687         return sk;
1688 }
1689 EXPORT_SYMBOL(sk_alloc);
1690
1691 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1692  * grace period. This is the case for UDP sockets and TCP listeners.
1693  */
1694 static void __sk_destruct(struct rcu_head *head)
1695 {
1696         struct sock *sk = container_of(head, struct sock, sk_rcu);
1697         struct sk_filter *filter;
1698
1699         if (sk->sk_destruct)
1700                 sk->sk_destruct(sk);
1701
1702         filter = rcu_dereference_check(sk->sk_filter,
1703                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1704         if (filter) {
1705                 sk_filter_uncharge(sk, filter);
1706                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1707         }
1708         if (rcu_access_pointer(sk->sk_reuseport_cb))
1709                 reuseport_detach_sock(sk);
1710
1711         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1712
1713 #ifdef CONFIG_BPF_SYSCALL
1714         bpf_sk_storage_free(sk);
1715 #endif
1716
1717         if (atomic_read(&sk->sk_omem_alloc))
1718                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1719                          __func__, atomic_read(&sk->sk_omem_alloc));
1720
1721         if (sk->sk_frag.page) {
1722                 put_page(sk->sk_frag.page);
1723                 sk->sk_frag.page = NULL;
1724         }
1725
1726         if (sk->sk_peer_cred)
1727                 put_cred(sk->sk_peer_cred);
1728         put_pid(sk->sk_peer_pid);
1729         if (likely(sk->sk_net_refcnt))
1730                 put_net(sock_net(sk));
1731         sk_prot_free(sk->sk_prot_creator, sk);
1732 }
1733
1734 void sk_destruct(struct sock *sk)
1735 {
1736         if (sock_flag(sk, SOCK_RCU_FREE))
1737                 call_rcu(&sk->sk_rcu, __sk_destruct);
1738         else
1739                 __sk_destruct(&sk->sk_rcu);
1740 }
1741
1742 static void __sk_free(struct sock *sk)
1743 {
1744         if (likely(sk->sk_net_refcnt))
1745                 sock_inuse_add(sock_net(sk), -1);
1746
1747         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1748                 sock_diag_broadcast_destroy(sk);
1749         else
1750                 sk_destruct(sk);
1751 }
1752
1753 void sk_free(struct sock *sk)
1754 {
1755         /*
1756          * We subtract one from sk_wmem_alloc and can know if
1757          * some packets are still in some tx queue.
1758          * If not null, sock_wfree() will call __sk_free(sk) later
1759          */
1760         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1761                 __sk_free(sk);
1762 }
1763 EXPORT_SYMBOL(sk_free);
1764
1765 static void sk_init_common(struct sock *sk)
1766 {
1767         skb_queue_head_init(&sk->sk_receive_queue);
1768         skb_queue_head_init(&sk->sk_write_queue);
1769         skb_queue_head_init(&sk->sk_error_queue);
1770
1771         rwlock_init(&sk->sk_callback_lock);
1772         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1773                         af_rlock_keys + sk->sk_family,
1774                         af_family_rlock_key_strings[sk->sk_family]);
1775         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1776                         af_wlock_keys + sk->sk_family,
1777                         af_family_wlock_key_strings[sk->sk_family]);
1778         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1779                         af_elock_keys + sk->sk_family,
1780                         af_family_elock_key_strings[sk->sk_family]);
1781         lockdep_set_class_and_name(&sk->sk_callback_lock,
1782                         af_callback_keys + sk->sk_family,
1783                         af_family_clock_key_strings[sk->sk_family]);
1784 }
1785
1786 /**
1787  *      sk_clone_lock - clone a socket, and lock its clone
1788  *      @sk: the socket to clone
1789  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1790  *
1791  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1792  */
1793 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1794 {
1795         struct sock *newsk;
1796         bool is_charged = true;
1797
1798         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1799         if (newsk != NULL) {
1800                 struct sk_filter *filter;
1801
1802                 sock_copy(newsk, sk);
1803
1804                 newsk->sk_prot_creator = sk->sk_prot;
1805
1806                 /* SANITY */
1807                 if (likely(newsk->sk_net_refcnt))
1808                         get_net(sock_net(newsk));
1809                 sk_node_init(&newsk->sk_node);
1810                 sock_lock_init(newsk);
1811                 bh_lock_sock(newsk);
1812                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1813                 newsk->sk_backlog.len = 0;
1814
1815                 atomic_set(&newsk->sk_rmem_alloc, 0);
1816                 /*
1817                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1818                  */
1819                 refcount_set(&newsk->sk_wmem_alloc, 1);
1820                 atomic_set(&newsk->sk_omem_alloc, 0);
1821                 sk_init_common(newsk);
1822
1823                 newsk->sk_dst_cache     = NULL;
1824                 newsk->sk_dst_pending_confirm = 0;
1825                 newsk->sk_wmem_queued   = 0;
1826                 newsk->sk_forward_alloc = 0;
1827                 atomic_set(&newsk->sk_drops, 0);
1828                 newsk->sk_send_head     = NULL;
1829                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1830                 atomic_set(&newsk->sk_zckey, 0);
1831
1832                 sock_reset_flag(newsk, SOCK_DONE);
1833                 mem_cgroup_sk_alloc(newsk);
1834                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1835
1836                 rcu_read_lock();
1837                 filter = rcu_dereference(sk->sk_filter);
1838                 if (filter != NULL)
1839                         /* though it's an empty new sock, the charging may fail
1840                          * if sysctl_optmem_max was changed between creation of
1841                          * original socket and cloning
1842                          */
1843                         is_charged = sk_filter_charge(newsk, filter);
1844                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1845                 rcu_read_unlock();
1846
1847                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1848                         /* We need to make sure that we don't uncharge the new
1849                          * socket if we couldn't charge it in the first place
1850                          * as otherwise we uncharge the parent's filter.
1851                          */
1852                         if (!is_charged)
1853                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1854                         sk_free_unlock_clone(newsk);
1855                         newsk = NULL;
1856                         goto out;
1857                 }
1858                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1859
1860                 newsk->sk_err      = 0;
1861                 newsk->sk_err_soft = 0;
1862                 newsk->sk_priority = 0;
1863                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1864                 if (likely(newsk->sk_net_refcnt))
1865                         sock_inuse_add(sock_net(newsk), 1);
1866
1867                 /*
1868                  * Before updating sk_refcnt, we must commit prior changes to memory
1869                  * (Documentation/RCU/rculist_nulls.txt for details)
1870                  */
1871                 smp_wmb();
1872                 refcount_set(&newsk->sk_refcnt, 2);
1873
1874                 /*
1875                  * Increment the counter in the same struct proto as the master
1876                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1877                  * is the same as sk->sk_prot->socks, as this field was copied
1878                  * with memcpy).
1879                  *
1880                  * This _changes_ the previous behaviour, where
1881                  * tcp_create_openreq_child always was incrementing the
1882                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1883                  * to be taken into account in all callers. -acme
1884                  */
1885                 sk_refcnt_debug_inc(newsk);
1886                 sk_set_socket(newsk, NULL);
1887                 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1888
1889                 if (newsk->sk_prot->sockets_allocated)
1890                         sk_sockets_allocated_inc(newsk);
1891
1892                 if (sock_needs_netstamp(sk) &&
1893                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1894                         net_enable_timestamp();
1895         }
1896 out:
1897         return newsk;
1898 }
1899 EXPORT_SYMBOL_GPL(sk_clone_lock);
1900
1901 void sk_free_unlock_clone(struct sock *sk)
1902 {
1903         /* It is still raw copy of parent, so invalidate
1904          * destructor and make plain sk_free() */
1905         sk->sk_destruct = NULL;
1906         bh_unlock_sock(sk);
1907         sk_free(sk);
1908 }
1909 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1910
1911 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1912 {
1913         u32 max_segs = 1;
1914
1915         sk_dst_set(sk, dst);
1916         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1917         if (sk->sk_route_caps & NETIF_F_GSO)
1918                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1919         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1920         if (sk_can_gso(sk)) {
1921                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1922                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1923                 } else {
1924                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1925                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1926                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1927                 }
1928         }
1929         sk->sk_gso_max_segs = max_segs;
1930 }
1931 EXPORT_SYMBOL_GPL(sk_setup_caps);
1932
1933 /*
1934  *      Simple resource managers for sockets.
1935  */
1936
1937
1938 /*
1939  * Write buffer destructor automatically called from kfree_skb.
1940  */
1941 void sock_wfree(struct sk_buff *skb)
1942 {
1943         struct sock *sk = skb->sk;
1944         unsigned int len = skb->truesize;
1945
1946         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1947                 /*
1948                  * Keep a reference on sk_wmem_alloc, this will be released
1949                  * after sk_write_space() call
1950                  */
1951                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1952                 sk->sk_write_space(sk);
1953                 len = 1;
1954         }
1955         /*
1956          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1957          * could not do because of in-flight packets
1958          */
1959         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1960                 __sk_free(sk);
1961 }
1962 EXPORT_SYMBOL(sock_wfree);
1963
1964 /* This variant of sock_wfree() is used by TCP,
1965  * since it sets SOCK_USE_WRITE_QUEUE.
1966  */
1967 void __sock_wfree(struct sk_buff *skb)
1968 {
1969         struct sock *sk = skb->sk;
1970
1971         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1972                 __sk_free(sk);
1973 }
1974
1975 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1976 {
1977         skb_orphan(skb);
1978         skb->sk = sk;
1979 #ifdef CONFIG_INET
1980         if (unlikely(!sk_fullsock(sk))) {
1981                 skb->destructor = sock_edemux;
1982                 sock_hold(sk);
1983                 return;
1984         }
1985 #endif
1986         skb->destructor = sock_wfree;
1987         skb_set_hash_from_sk(skb, sk);
1988         /*
1989          * We used to take a refcount on sk, but following operation
1990          * is enough to guarantee sk_free() wont free this sock until
1991          * all in-flight packets are completed
1992          */
1993         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1994 }
1995 EXPORT_SYMBOL(skb_set_owner_w);
1996
1997 /* This helper is used by netem, as it can hold packets in its
1998  * delay queue. We want to allow the owner socket to send more
1999  * packets, as if they were already TX completed by a typical driver.
2000  * But we also want to keep skb->sk set because some packet schedulers
2001  * rely on it (sch_fq for example).
2002  */
2003 void skb_orphan_partial(struct sk_buff *skb)
2004 {
2005         if (skb_is_tcp_pure_ack(skb))
2006                 return;
2007
2008         if (skb->destructor == sock_wfree
2009 #ifdef CONFIG_INET
2010             || skb->destructor == tcp_wfree
2011 #endif
2012                 ) {
2013                 struct sock *sk = skb->sk;
2014
2015                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2016                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2017                         skb->destructor = sock_efree;
2018                 }
2019         } else {
2020                 skb_orphan(skb);
2021         }
2022 }
2023 EXPORT_SYMBOL(skb_orphan_partial);
2024
2025 /*
2026  * Read buffer destructor automatically called from kfree_skb.
2027  */
2028 void sock_rfree(struct sk_buff *skb)
2029 {
2030         struct sock *sk = skb->sk;
2031         unsigned int len = skb->truesize;
2032
2033         atomic_sub(len, &sk->sk_rmem_alloc);
2034         sk_mem_uncharge(sk, len);
2035 }
2036 EXPORT_SYMBOL(sock_rfree);
2037
2038 /*
2039  * Buffer destructor for skbs that are not used directly in read or write
2040  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2041  */
2042 void sock_efree(struct sk_buff *skb)
2043 {
2044         sock_put(skb->sk);
2045 }
2046 EXPORT_SYMBOL(sock_efree);
2047
2048 kuid_t sock_i_uid(struct sock *sk)
2049 {
2050         kuid_t uid;
2051
2052         read_lock_bh(&sk->sk_callback_lock);
2053         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2054         read_unlock_bh(&sk->sk_callback_lock);
2055         return uid;
2056 }
2057 EXPORT_SYMBOL(sock_i_uid);
2058
2059 unsigned long sock_i_ino(struct sock *sk)
2060 {
2061         unsigned long ino;
2062
2063         read_lock_bh(&sk->sk_callback_lock);
2064         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2065         read_unlock_bh(&sk->sk_callback_lock);
2066         return ino;
2067 }
2068 EXPORT_SYMBOL(sock_i_ino);
2069
2070 /*
2071  * Allocate a skb from the socket's send buffer.
2072  */
2073 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2074                              gfp_t priority)
2075 {
2076         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2077                 struct sk_buff *skb = alloc_skb(size, priority);
2078                 if (skb) {
2079                         skb_set_owner_w(skb, sk);
2080                         return skb;
2081                 }
2082         }
2083         return NULL;
2084 }
2085 EXPORT_SYMBOL(sock_wmalloc);
2086
2087 static void sock_ofree(struct sk_buff *skb)
2088 {
2089         struct sock *sk = skb->sk;
2090
2091         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2092 }
2093
2094 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2095                              gfp_t priority)
2096 {
2097         struct sk_buff *skb;
2098
2099         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2100         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2101             sysctl_optmem_max)
2102                 return NULL;
2103
2104         skb = alloc_skb(size, priority);
2105         if (!skb)
2106                 return NULL;
2107
2108         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2109         skb->sk = sk;
2110         skb->destructor = sock_ofree;
2111         return skb;
2112 }
2113
2114 /*
2115  * Allocate a memory block from the socket's option memory buffer.
2116  */
2117 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2118 {
2119         if ((unsigned int)size <= sysctl_optmem_max &&
2120             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2121                 void *mem;
2122                 /* First do the add, to avoid the race if kmalloc
2123                  * might sleep.
2124                  */
2125                 atomic_add(size, &sk->sk_omem_alloc);
2126                 mem = kmalloc(size, priority);
2127                 if (mem)
2128                         return mem;
2129                 atomic_sub(size, &sk->sk_omem_alloc);
2130         }
2131         return NULL;
2132 }
2133 EXPORT_SYMBOL(sock_kmalloc);
2134
2135 /* Free an option memory block. Note, we actually want the inline
2136  * here as this allows gcc to detect the nullify and fold away the
2137  * condition entirely.
2138  */
2139 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2140                                   const bool nullify)
2141 {
2142         if (WARN_ON_ONCE(!mem))
2143                 return;
2144         if (nullify)
2145                 kzfree(mem);
2146         else
2147                 kfree(mem);
2148         atomic_sub(size, &sk->sk_omem_alloc);
2149 }
2150
2151 void sock_kfree_s(struct sock *sk, void *mem, int size)
2152 {
2153         __sock_kfree_s(sk, mem, size, false);
2154 }
2155 EXPORT_SYMBOL(sock_kfree_s);
2156
2157 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2158 {
2159         __sock_kfree_s(sk, mem, size, true);
2160 }
2161 EXPORT_SYMBOL(sock_kzfree_s);
2162
2163 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2164    I think, these locks should be removed for datagram sockets.
2165  */
2166 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2167 {
2168         DEFINE_WAIT(wait);
2169
2170         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2171         for (;;) {
2172                 if (!timeo)
2173                         break;
2174                 if (signal_pending(current))
2175                         break;
2176                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2177                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2178                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2179                         break;
2180                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2181                         break;
2182                 if (sk->sk_err)
2183                         break;
2184                 timeo = schedule_timeout(timeo);
2185         }
2186         finish_wait(sk_sleep(sk), &wait);
2187         return timeo;
2188 }
2189
2190
2191 /*
2192  *      Generic send/receive buffer handlers
2193  */
2194
2195 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2196                                      unsigned long data_len, int noblock,
2197                                      int *errcode, int max_page_order)
2198 {
2199         struct sk_buff *skb;
2200         long timeo;
2201         int err;
2202
2203         timeo = sock_sndtimeo(sk, noblock);
2204         for (;;) {
2205                 err = sock_error(sk);
2206                 if (err != 0)
2207                         goto failure;
2208
2209                 err = -EPIPE;
2210                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2211                         goto failure;
2212
2213                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2214                         break;
2215
2216                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2217                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2218                 err = -EAGAIN;
2219                 if (!timeo)
2220                         goto failure;
2221                 if (signal_pending(current))
2222                         goto interrupted;
2223                 timeo = sock_wait_for_wmem(sk, timeo);
2224         }
2225         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2226                                    errcode, sk->sk_allocation);
2227         if (skb)
2228                 skb_set_owner_w(skb, sk);
2229         return skb;
2230
2231 interrupted:
2232         err = sock_intr_errno(timeo);
2233 failure:
2234         *errcode = err;
2235         return NULL;
2236 }
2237 EXPORT_SYMBOL(sock_alloc_send_pskb);
2238
/*
 * Convenience form of sock_alloc_send_pskb() for a purely linear skb:
 * @size bytes of header space, no paged data, page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2245
2246 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2247                      struct sockcm_cookie *sockc)
2248 {
2249         u32 tsflags;
2250
2251         switch (cmsg->cmsg_type) {
2252         case SO_MARK:
2253                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2254                         return -EPERM;
2255                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2256                         return -EINVAL;
2257                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2258                 break;
2259         case SO_TIMESTAMPING_OLD:
2260                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2261                         return -EINVAL;
2262
2263                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2264                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2265                         return -EINVAL;
2266
2267                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2268                 sockc->tsflags |= tsflags;
2269                 break;
2270         case SCM_TXTIME:
2271                 if (!sock_flag(sk, SOCK_TXTIME))
2272                         return -EINVAL;
2273                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2274                         return -EINVAL;
2275                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2276                 break;
2277         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2278         case SCM_RIGHTS:
2279         case SCM_CREDENTIALS:
2280                 break;
2281         default:
2282                 return -EINVAL;
2283         }
2284         return 0;
2285 }
2286 EXPORT_SYMBOL(__sock_cmsg_send);
2287
2288 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2289                    struct sockcm_cookie *sockc)
2290 {
2291         struct cmsghdr *cmsg;
2292         int ret;
2293
2294         for_each_cmsghdr(cmsg, msg) {
2295                 if (!CMSG_OK(msg, cmsg))
2296                         return -EINVAL;
2297                 if (cmsg->cmsg_level != SOL_SOCKET)
2298                         continue;
2299                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2300                 if (ret)
2301                         return ret;
2302         }
2303         return 0;
2304 }
2305 EXPORT_SYMBOL(sock_cmsg_send);
2306
2307 static void sk_enter_memory_pressure(struct sock *sk)
2308 {
2309         if (!sk->sk_prot->enter_memory_pressure)
2310                 return;
2311
2312         sk->sk_prot->enter_memory_pressure(sk);
2313 }
2314
2315 static void sk_leave_memory_pressure(struct sock *sk)
2316 {
2317         if (sk->sk_prot->leave_memory_pressure) {
2318                 sk->sk_prot->leave_memory_pressure(sk);
2319         } else {
2320                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2321
2322                 if (memory_pressure && *memory_pressure)
2323                         *memory_pressure = 0;
2324         }
2325 }
2326
2327 /* On 32bit arches, an skb frag is limited to 2^15 */
2328 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2329
2330 /**
2331  * skb_page_frag_refill - check that a page_frag contains enough room
2332  * @sz: minimum size of the fragment we want to get
2333  * @pfrag: pointer to page_frag
2334  * @gfp: priority for memory allocation
2335  *
2336  * Note: While this allocator tries to use high order pages, there is
2337  * no guarantee that allocations succeed. Therefore, @sz MUST be
2338  * less or equal than PAGE_SIZE.
2339  */
2340 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2341 {
2342         if (pfrag->page) {
2343                 if (page_ref_count(pfrag->page) == 1) {
2344                         pfrag->offset = 0;
2345                         return true;
2346                 }
2347                 if (pfrag->offset + sz <= pfrag->size)
2348                         return true;
2349                 put_page(pfrag->page);
2350         }
2351
2352         pfrag->offset = 0;
2353         if (SKB_FRAG_PAGE_ORDER) {
2354                 /* Avoid direct reclaim but allow kswapd to wake */
2355                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2356                                           __GFP_COMP | __GFP_NOWARN |
2357                                           __GFP_NORETRY,
2358                                           SKB_FRAG_PAGE_ORDER);
2359                 if (likely(pfrag->page)) {
2360                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2361                         return true;
2362                 }
2363         }
2364         pfrag->page = alloc_page(gfp);
2365         if (likely(pfrag->page)) {
2366                 pfrag->size = PAGE_SIZE;
2367                 return true;
2368         }
2369         return false;
2370 }
2371 EXPORT_SYMBOL(skb_page_frag_refill);
2372
2373 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2374 {
2375         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2376                 return true;
2377
2378         sk_enter_memory_pressure(sk);
2379         sk_stream_moderate_sndbuf(sk);
2380         return false;
2381 }
2382 EXPORT_SYMBOL(sk_page_frag_refill);
2383
2384 static void __lock_sock(struct sock *sk)
2385         __releases(&sk->sk_lock.slock)
2386         __acquires(&sk->sk_lock.slock)
2387 {
2388         DEFINE_WAIT(wait);
2389
2390         for (;;) {
2391                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2392                                         TASK_UNINTERRUPTIBLE);
2393                 spin_unlock_bh(&sk->sk_lock.slock);
2394                 schedule();
2395                 spin_lock_bh(&sk->sk_lock.slock);
2396                 if (!sock_owned_by_user(sk))
2397                         break;
2398         }
2399         finish_wait(&sk->sk_lock.wq, &wait);
2400 }
2401
2402 void __release_sock(struct sock *sk)
2403         __releases(&sk->sk_lock.slock)
2404         __acquires(&sk->sk_lock.slock)
2405 {
2406         struct sk_buff *skb, *next;
2407
2408         while ((skb = sk->sk_backlog.head) != NULL) {
2409                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2410
2411                 spin_unlock_bh(&sk->sk_lock.slock);
2412
2413                 do {
2414                         next = skb->next;
2415                         prefetch(next);
2416                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2417                         skb_mark_not_on_list(skb);
2418                         sk_backlog_rcv(sk, skb);
2419
2420                         cond_resched();
2421
2422                         skb = next;
2423                 } while (skb != NULL);
2424
2425                 spin_lock_bh(&sk->sk_lock.slock);
2426         }
2427
2428         /*
2429          * Doing the zeroing here guarantee we can not loop forever
2430          * while a wild producer attempts to flood us.
2431          */
2432         sk->sk_backlog.len = 0;
2433 }
2434
2435 void __sk_flush_backlog(struct sock *sk)
2436 {
2437         spin_lock_bh(&sk->sk_lock.slock);
2438         __release_sock(sk);
2439         spin_unlock_bh(&sk->sk_lock.slock);
2440 }
2441
2442 /**
2443  * sk_wait_data - wait for data to arrive at sk_receive_queue
2444  * @sk:    sock to wait on
2445  * @timeo: for how long
2446  * @skb:   last skb seen on sk_receive_queue
2447  *
2448  * Now socket state including sk->sk_err is changed only under lock,
2449  * hence we may omit checks after joining wait queue.
2450  * We check receive queue before schedule() only as optimization;
2451  * it is very likely that release_sock() added new data.
2452  */
2453 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2454 {
2455         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2456         int rc;
2457
2458         add_wait_queue(sk_sleep(sk), &wait);
2459         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2460         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2461         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2462         remove_wait_queue(sk_sleep(sk), &wait);
2463         return rc;
2464 }
2465 EXPORT_SYMBOL(sk_wait_data);
2466
2467 /**
2468  *      __sk_mem_raise_allocated - increase memory_allocated
2469  *      @sk: socket
2470  *      @size: memory size to allocate
2471  *      @amt: pages to allocate
2472  *      @kind: allocation type
2473  *
2474  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2475  */
2476 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2477 {
2478         struct proto *prot = sk->sk_prot;
2479         long allocated = sk_memory_allocated_add(sk, amt);
2480         bool charged = true;
2481
2482         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2483             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2484                 goto suppress_allocation;
2485
2486         /* Under limit. */
2487         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2488                 sk_leave_memory_pressure(sk);
2489                 return 1;
2490         }
2491
2492         /* Under pressure. */
2493         if (allocated > sk_prot_mem_limits(sk, 1))
2494                 sk_enter_memory_pressure(sk);
2495
2496         /* Over hard limit. */
2497         if (allocated > sk_prot_mem_limits(sk, 2))
2498                 goto suppress_allocation;
2499
2500         /* guarantee minimum buffer size under pressure */
2501         if (kind == SK_MEM_RECV) {
2502                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2503                         return 1;
2504
2505         } else { /* SK_MEM_SEND */
2506                 int wmem0 = sk_get_wmem0(sk, prot);
2507
2508                 if (sk->sk_type == SOCK_STREAM) {
2509                         if (sk->sk_wmem_queued < wmem0)
2510                                 return 1;
2511                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2512                                 return 1;
2513                 }
2514         }
2515
2516         if (sk_has_memory_pressure(sk)) {
2517                 u64 alloc;
2518
2519                 if (!sk_under_memory_pressure(sk))
2520                         return 1;
2521                 alloc = sk_sockets_allocated_read_positive(sk);
2522                 if (sk_prot_mem_limits(sk, 2) > alloc *
2523                     sk_mem_pages(sk->sk_wmem_queued +
2524                                  atomic_read(&sk->sk_rmem_alloc) +
2525                                  sk->sk_forward_alloc))
2526                         return 1;
2527         }
2528
2529 suppress_allocation:
2530
2531         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2532                 sk_stream_moderate_sndbuf(sk);
2533
2534                 /* Fail only if socket is _under_ its sndbuf.
2535                  * In this case we cannot block, so that we have to fail.
2536                  */
2537                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2538                         return 1;
2539         }
2540
2541         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2542                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2543
2544         sk_memory_allocated_sub(sk, amt);
2545
2546         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2547                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2548
2549         return 0;
2550 }
2551 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2552
2553 /**
2554  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2555  *      @sk: socket
2556  *      @size: memory size to allocate
2557  *      @kind: allocation type
2558  *
2559  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2560  *      rmem allocation. This function assumes that protocols which have
2561  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2562  */
2563 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2564 {
2565         int ret, amt = sk_mem_pages(size);
2566
2567         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2568         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2569         if (!ret)
2570                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2571         return ret;
2572 }
2573 EXPORT_SYMBOL(__sk_mem_schedule);
2574
2575 /**
2576  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2577  *      @sk: socket
2578  *      @amount: number of quanta
2579  *
2580  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2581  */
2582 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2583 {
2584         sk_memory_allocated_sub(sk, amount);
2585
2586         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2587                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2588
2589         if (sk_under_memory_pressure(sk) &&
2590             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2591                 sk_leave_memory_pressure(sk);
2592 }
2593 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2594
2595 /**
2596  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2597  *      @sk: socket
2598  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2599  */
2600 void __sk_mem_reclaim(struct sock *sk, int amount)
2601 {
2602         amount >>= SK_MEM_QUANTUM_SHIFT;
2603         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2604         __sk_mem_reduce_allocated(sk, amount);
2605 }
2606 EXPORT_SYMBOL(__sk_mem_reclaim);
2607
2608 int sk_set_peek_off(struct sock *sk, int val)
2609 {
2610         sk->sk_peek_off = val;
2611         return 0;
2612 }
2613 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2614
2615 /*
2616  * Set of default routines for initialising struct proto_ops when
2617  * the protocol does not support a particular function. In certain
2618  * cases where it makes no sense for a protocol to have a "do nothing"
2619  * function, some default processing is provided.
2620  */
2621
2622 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2623 {
2624         return -EOPNOTSUPP;
2625 }
2626 EXPORT_SYMBOL(sock_no_bind);
2627
2628 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2629                     int len, int flags)
2630 {
2631         return -EOPNOTSUPP;
2632 }
2633 EXPORT_SYMBOL(sock_no_connect);
2634
2635 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2636 {
2637         return -EOPNOTSUPP;
2638 }
2639 EXPORT_SYMBOL(sock_no_socketpair);
2640
2641 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2642                    bool kern)
2643 {
2644         return -EOPNOTSUPP;
2645 }
2646 EXPORT_SYMBOL(sock_no_accept);
2647
2648 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2649                     int peer)
2650 {
2651         return -EOPNOTSUPP;
2652 }
2653 EXPORT_SYMBOL(sock_no_getname);
2654
2655 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2656 {
2657         return -EOPNOTSUPP;
2658 }
2659 EXPORT_SYMBOL(sock_no_ioctl);
2660
2661 int sock_no_listen(struct socket *sock, int backlog)
2662 {
2663         return -EOPNOTSUPP;
2664 }
2665 EXPORT_SYMBOL(sock_no_listen);
2666
2667 int sock_no_shutdown(struct socket *sock, int how)
2668 {
2669         return -EOPNOTSUPP;
2670 }
2671 EXPORT_SYMBOL(sock_no_shutdown);
2672
2673 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2674                     char __user *optval, unsigned int optlen)
2675 {
2676         return -EOPNOTSUPP;
2677 }
2678 EXPORT_SYMBOL(sock_no_setsockopt);
2679
2680 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2681                     char __user *optval, int __user *optlen)
2682 {
2683         return -EOPNOTSUPP;
2684 }
2685 EXPORT_SYMBOL(sock_no_getsockopt);
2686
2687 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2688 {
2689         return -EOPNOTSUPP;
2690 }
2691 EXPORT_SYMBOL(sock_no_sendmsg);
2692
2693 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2694 {
2695         return -EOPNOTSUPP;
2696 }
2697 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2698
2699 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2700                     int flags)
2701 {
2702         return -EOPNOTSUPP;
2703 }
2704 EXPORT_SYMBOL(sock_no_recvmsg);
2705
2706 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2707 {
2708         /* Mirror missing mmap method error code */
2709         return -ENODEV;
2710 }
2711 EXPORT_SYMBOL(sock_no_mmap);
2712
2713 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2714 {
2715         ssize_t res;
2716         struct msghdr msg = {.msg_flags = flags};
2717         struct kvec iov;
2718         char *kaddr = kmap(page);
2719         iov.iov_base = kaddr + offset;
2720         iov.iov_len = size;
2721         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2722         kunmap(page);
2723         return res;
2724 }
2725 EXPORT_SYMBOL(sock_no_sendpage);
2726
2727 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2728                                 int offset, size_t size, int flags)
2729 {
2730         ssize_t res;
2731         struct msghdr msg = {.msg_flags = flags};
2732         struct kvec iov;
2733         char *kaddr = kmap(page);
2734
2735         iov.iov_base = kaddr + offset;
2736         iov.iov_len = size;
2737         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2738         kunmap(page);
2739         return res;
2740 }
2741 EXPORT_SYMBOL(sock_no_sendpage_locked);
2742
2743 /*
2744  *      Default Socket Callbacks
2745  */
2746
2747 static void sock_def_wakeup(struct sock *sk)
2748 {
2749         struct socket_wq *wq;
2750
2751         rcu_read_lock();
2752         wq = rcu_dereference(sk->sk_wq);
2753         if (skwq_has_sleeper(wq))
2754                 wake_up_interruptible_all(&wq->wait);
2755         rcu_read_unlock();
2756 }
2757
2758 static void sock_def_error_report(struct sock *sk)
2759 {
2760         struct socket_wq *wq;
2761
2762         rcu_read_lock();
2763         wq = rcu_dereference(sk->sk_wq);
2764         if (skwq_has_sleeper(wq))
2765                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2766         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2767         rcu_read_unlock();
2768 }
2769
2770 static void sock_def_readable(struct sock *sk)
2771 {
2772         struct socket_wq *wq;
2773
2774         rcu_read_lock();
2775         wq = rcu_dereference(sk->sk_wq);
2776         if (skwq_has_sleeper(wq))
2777                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2778                                                 EPOLLRDNORM | EPOLLRDBAND);
2779         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2780         rcu_read_unlock();
2781 }
2782
2783 static void sock_def_write_space(struct sock *sk)
2784 {
2785         struct socket_wq *wq;
2786
2787         rcu_read_lock();
2788
2789         /* Do not wake up a writer until he can make "significant"
2790          * progress.  --DaveM
2791          */
2792         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2793                 wq = rcu_dereference(sk->sk_wq);
2794                 if (skwq_has_sleeper(wq))
2795                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2796                                                 EPOLLWRNORM | EPOLLWRBAND);
2797
2798                 /* Should agree with poll, otherwise some programs break */
2799                 if (sock_writeable(sk))
2800                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2801         }
2802
2803         rcu_read_unlock();
2804 }
2805
/* Default sk_destruct callback: nothing to tear down. */
static void sock_def_destruct(struct sock *sk)
{
}
2809
2810 void sk_send_sigurg(struct sock *sk)
2811 {
2812         if (sk->sk_socket && sk->sk_socket->file)
2813                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2814                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2815 }
2816 EXPORT_SYMBOL(sk_send_sigurg);
2817
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns zero (presumably meaning the timer was not already
 * pending - see the timer API), so each armed timer holds one reference.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2825
/*
 * Cancel @timer; when del_timer() returns non-zero, drop a reference on
 * @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2832
2833 void sock_init_data(struct socket *sock, struct sock *sk)
2834 {
2835         sk_init_common(sk);
2836         sk->sk_send_head        =       NULL;
2837
2838         timer_setup(&sk->sk_timer, NULL, 0);
2839
2840         sk->sk_allocation       =       GFP_KERNEL;
2841         sk->sk_rcvbuf           =       sysctl_rmem_default;
2842         sk->sk_sndbuf           =       sysctl_wmem_default;
2843         sk->sk_state            =       TCP_CLOSE;
2844         sk_set_socket(sk, sock);
2845
2846         sock_set_flag(sk, SOCK_ZAPPED);
2847
2848         if (sock) {
2849                 sk->sk_type     =       sock->type;
2850                 RCU_INIT_POINTER(sk->sk_wq, sock->wq);
2851                 sock->sk        =       sk;
2852                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2853         } else {
2854                 RCU_INIT_POINTER(sk->sk_wq, NULL);
2855                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2856         }
2857
2858         rwlock_init(&sk->sk_callback_lock);
2859         if (sk->sk_kern_sock)
2860                 lockdep_set_class_and_name(
2861                         &sk->sk_callback_lock,
2862                         af_kern_callback_keys + sk->sk_family,
2863                         af_family_kern_clock_key_strings[sk->sk_family]);
2864         else
2865                 lockdep_set_class_and_name(
2866                         &sk->sk_callback_lock,
2867                         af_callback_keys + sk->sk_family,
2868                         af_family_clock_key_strings[sk->sk_family]);
2869
2870         sk->sk_state_change     =       sock_def_wakeup;
2871         sk->sk_data_ready       =       sock_def_readable;
2872         sk->sk_write_space      =       sock_def_write_space;
2873         sk->sk_error_report     =       sock_def_error_report;
2874         sk->sk_destruct         =       sock_def_destruct;
2875
2876         sk->sk_frag.page        =       NULL;
2877         sk->sk_frag.offset      =       0;
2878         sk->sk_peek_off         =       -1;
2879
2880         sk->sk_peer_pid         =       NULL;
2881         sk->sk_peer_cred        =       NULL;
2882         sk->sk_write_pending    =       0;
2883         sk->sk_rcvlowat         =       1;
2884         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2885         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2886
2887         sk->sk_stamp = SK_DEFAULT_STAMP;
2888 #if BITS_PER_LONG==32
2889         seqlock_init(&sk->sk_stamp_seq);
2890 #endif
2891         atomic_set(&sk->sk_zckey, 0);
2892
2893 #ifdef CONFIG_NET_RX_BUSY_POLL
2894         sk->sk_napi_id          =       0;
2895         sk->sk_ll_usec          =       sysctl_net_busy_read;
2896 #endif
2897
2898         sk->sk_max_pacing_rate = ~0UL;
2899         sk->sk_pacing_rate = ~0UL;
2900         sk->sk_pacing_shift = 10;
2901         sk->sk_incoming_cpu = -1;
2902
2903         sk_rx_queue_clear(sk);
2904         /*
2905          * Before updating sk_refcnt, we must commit prior changes to memory
2906          * (Documentation/RCU/rculist_nulls.txt for details)
2907          */
2908         smp_wmb();
2909         refcount_set(&sk->sk_refcnt, 1);
2910         atomic_set(&sk->sk_drops, 0);
2911 }
2912 EXPORT_SYMBOL(sock_init_data);
2913
2914 void lock_sock_nested(struct sock *sk, int subclass)
2915 {
2916         might_sleep();
2917         spin_lock_bh(&sk->sk_lock.slock);
2918         if (sk->sk_lock.owned)
2919                 __lock_sock(sk);
2920         sk->sk_lock.owned = 1;
2921         spin_unlock(&sk->sk_lock.slock);
2922         /*
2923          * The sk_lock has mutex_lock() semantics here:
2924          */
2925         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2926         local_bh_enable();
2927 }
2928 EXPORT_SYMBOL(lock_sock_nested);
2929
2930 void release_sock(struct sock *sk)
2931 {
2932         spin_lock_bh(&sk->sk_lock.slock);
2933         if (sk->sk_backlog.tail)
2934                 __release_sock(sk);
2935
2936         /* Warning : release_cb() might need to release sk ownership,
2937          * ie call sock_release_ownership(sk) before us.
2938          */
2939         if (sk->sk_prot->release_cb)
2940                 sk->sk_prot->release_cb(sk);
2941
2942         sock_release_ownership(sk);
2943         if (waitqueue_active(&sk->sk_lock.wq))
2944                 wake_up(&sk->sk_lock.wq);
2945         spin_unlock_bh(&sk->sk_lock.slock);
2946 }
2947 EXPORT_SYMBOL(release_sock);
2948
2949 /**
2950  * lock_sock_fast - fast version of lock_sock
2951  * @sk: socket
2952  *
2953  * This version should be used for very small section, where process wont block
2954  * return false if fast path is taken:
2955  *
2956  *   sk_lock.slock locked, owned = 0, BH disabled
2957  *
2958  * return true if slow path is taken:
2959  *
2960  *   sk_lock.slock unlocked, owned = 1, BH enabled
2961  */
2962 bool lock_sock_fast(struct sock *sk)
2963 {
2964         might_sleep();
2965         spin_lock_bh(&sk->sk_lock.slock);
2966
2967         if (!sk->sk_lock.owned)
2968                 /*
2969                  * Note : We must disable BH
2970                  */
2971                 return false;
2972
2973         __lock_sock(sk);
2974         sk->sk_lock.owned = 1;
2975         spin_unlock(&sk->sk_lock.slock);
2976         /*
2977          * The sk_lock has mutex_lock() semantics here:
2978          */
2979         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2980         local_bh_enable();
2981         return true;
2982 }
2983 EXPORT_SYMBOL(lock_sock_fast);
2984
2985 int sock_gettstamp(struct socket *sock, void __user *userstamp,
2986                    bool timeval, bool time32)
2987 {
2988         struct sock *sk = sock->sk;
2989         struct timespec64 ts;
2990
2991         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2992         ts = ktime_to_timespec64(sock_read_timestamp(sk));
2993         if (ts.tv_sec == -1)
2994                 return -ENOENT;
2995         if (ts.tv_sec == 0) {
2996                 ktime_t kt = ktime_get_real();
2997                 sock_write_timestamp(sk, kt);;
2998                 ts = ktime_to_timespec64(kt);
2999         }
3000
3001         if (timeval)
3002                 ts.tv_nsec /= 1000;
3003
3004 #ifdef CONFIG_COMPAT_32BIT_TIME
3005         if (time32)
3006                 return put_old_timespec32(&ts, userstamp);
3007 #endif
3008 #ifdef CONFIG_SPARC64
3009         /* beware of padding in sparc64 timeval */
3010         if (timeval && !in_compat_syscall()) {
3011                 struct __kernel_old_timeval __user tv = {
3012                         .tv_sec = ts.tv_sec,
3013                         .tv_usec = ts.tv_nsec,
3014                 };
3015                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3016                         return -EFAULT;
3017                 return 0;
3018         }
3019 #endif
3020         return put_timespec64(&ts, userstamp);
3021 }
3022 EXPORT_SYMBOL(sock_gettstamp);
3023
3024 void sock_enable_timestamp(struct sock *sk, int flag)
3025 {
3026         if (!sock_flag(sk, flag)) {
3027                 unsigned long previous_flags = sk->sk_flags;
3028
3029                 sock_set_flag(sk, flag);
3030                 /*
3031                  * we just set one of the two flags which require net
3032                  * time stamping, but time stamping might have been on
3033                  * already because of the other one
3034                  */
3035                 if (sock_needs_netstamp(sk) &&
3036                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3037                         net_enable_timestamp();
3038         }
3039 }
3040
3041 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3042                        int level, int type)
3043 {
3044         struct sock_exterr_skb *serr;
3045         struct sk_buff *skb;
3046         int copied, err;
3047
3048         err = -EAGAIN;
3049         skb = sock_dequeue_err_skb(sk);
3050         if (skb == NULL)
3051                 goto out;
3052
3053         copied = skb->len;
3054         if (copied > len) {
3055                 msg->msg_flags |= MSG_TRUNC;
3056                 copied = len;
3057         }
3058         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3059         if (err)
3060                 goto out_free_skb;
3061
3062         sock_recv_timestamp(msg, sk, skb);
3063
3064         serr = SKB_EXT_ERR(skb);
3065         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3066
3067         msg->msg_flags |= MSG_ERRQUEUE;
3068         err = copied;
3069
3070 out_free_skb:
3071         kfree_skb(skb);
3072 out:
3073         return err;
3074 }
3075 EXPORT_SYMBOL(sock_recv_errqueue);
3076
3077 /*
3078  *      Get a socket option on an socket.
3079  *
3080  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3081  *      asynchronous errors should be reported by getsockopt. We assume
3082  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3083  */
3084 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3085                            char __user *optval, int __user *optlen)
3086 {
3087         struct sock *sk = sock->sk;
3088
3089         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3090 }
3091 EXPORT_SYMBOL(sock_common_getsockopt);
3092
#ifdef CONFIG_COMPAT
/*
 * Compat getsockopt: use the protocol's compat_getsockopt handler when it
 * provides one, otherwise fall back to its regular getsockopt handler.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct proto *prot = sk->sk_prot;

	if (!prot->compat_getsockopt)
		return prot->getsockopt(sk, level, optname, optval, optlen);

	return prot->compat_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
3106
3107 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3108                         int flags)
3109 {
3110         struct sock *sk = sock->sk;
3111         int addr_len = 0;
3112         int err;
3113
3114         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3115                                    flags & ~MSG_DONTWAIT, &addr_len);
3116         if (err >= 0)
3117                 msg->msg_namelen = addr_len;
3118         return err;
3119 }
3120 EXPORT_SYMBOL(sock_common_recvmsg);
3121
3122 /*
3123  *      Set socket options on an inet socket.
3124  */
3125 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3126                            char __user *optval, unsigned int optlen)
3127 {
3128         struct sock *sk = sock->sk;
3129
3130         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3131 }
3132 EXPORT_SYMBOL(sock_common_setsockopt);
3133
#ifdef CONFIG_COMPAT
/*
 * Compat setsockopt: use the protocol's compat_setsockopt handler when it
 * provides one, otherwise fall back to its regular setsockopt handler.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct proto *prot = sk->sk_prot;

	if (!prot->compat_setsockopt)
		return prot->setsockopt(sk, level, optname, optval, optlen);

	return prot->compat_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
3147
3148 void sk_common_release(struct sock *sk)
3149 {
3150         if (sk->sk_prot->destroy)
3151                 sk->sk_prot->destroy(sk);
3152
3153         /*
3154          * Observation: when sock_common_release is called, processes have
3155          * no access to socket. But net still has.
3156          * Step one, detach it from networking:
3157          *
3158          * A. Remove from hash tables.
3159          */
3160
3161         sk->sk_prot->unhash(sk);
3162
3163         /*
3164          * In this point socket cannot receive new packets, but it is possible
3165          * that some packets are in flight because some CPU runs receiver and
3166          * did hash table lookup before we unhashed socket. They will achieve
3167          * receive queue and will be purged by socket destructor.
3168          *
3169          * Also we still have packets pending on receive queue and probably,
3170          * our own packets waiting in device queues. sock_destroy will drain
3171          * receive queue, but transmitted packets will delay socket destruction
3172          * until the last reference will be released.
3173          */
3174
3175         sock_orphan(sk);
3176
3177         xfrm_sk_free_policy(sk);
3178
3179         sk_refcnt_debug_release(sk);
3180
3181         sock_put(sk);
3182 }
3183 EXPORT_SYMBOL(sk_common_release);
3184
3185 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3186 {
3187         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3188
3189         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3190         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3191         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3192         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3193         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3194         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3195         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3196         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3197         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3198 }
3199
3200 #ifdef CONFIG_PROC_FS
3201 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3202 struct prot_inuse {
3203         int val[PROTO_INUSE_NR];
3204 };
3205
3206 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3207
3208 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3209 {
3210         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3211 }
3212 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3213
3214 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3215 {
3216         int cpu, idx = prot->inuse_idx;
3217         int res = 0;
3218
3219         for_each_possible_cpu(cpu)
3220                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3221
3222         return res >= 0 ? res : 0;
3223 }
3224 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3225
3226 static void sock_inuse_add(struct net *net, int val)
3227 {
3228         this_cpu_add(*net->core.sock_inuse, val);
3229 }
3230
3231 int sock_inuse_get(struct net *net)
3232 {
3233         int cpu, res = 0;
3234
3235         for_each_possible_cpu(cpu)
3236                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3237
3238         return res;
3239 }
3240
3241 EXPORT_SYMBOL_GPL(sock_inuse_get);
3242
3243 static int __net_init sock_inuse_init_net(struct net *net)
3244 {
3245         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3246         if (net->core.prot_inuse == NULL)
3247                 return -ENOMEM;
3248
3249         net->core.sock_inuse = alloc_percpu(int);
3250         if (net->core.sock_inuse == NULL)
3251                 goto out;
3252
3253         return 0;
3254
3255 out:
3256         free_percpu(net->core.prot_inuse);
3257         return -ENOMEM;
3258 }
3259
3260 static void __net_exit sock_inuse_exit_net(struct net *net)
3261 {
3262         free_percpu(net->core.prot_inuse);
3263         free_percpu(net->core.sock_inuse);
3264 }
3265
3266 static struct pernet_operations net_inuse_ops = {
3267         .init = sock_inuse_init_net,
3268         .exit = sock_inuse_exit_net,
3269 };
3270
3271 static __init int net_inuse_init(void)
3272 {
3273         if (register_pernet_subsys(&net_inuse_ops))
3274                 panic("Cannot initialize net inuse counters");
3275
3276         return 0;
3277 }
3278
3279 core_initcall(net_inuse_init);
3280
3281 static void assign_proto_idx(struct proto *prot)
3282 {
3283         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3284
3285         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3286                 pr_err("PROTO_INUSE_NR exhausted\n");
3287                 return;
3288         }
3289
3290         set_bit(prot->inuse_idx, proto_inuse_idx);
3291 }
3292
3293 static void release_proto_idx(struct proto *prot)
3294 {
3295         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3296                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3297 }
3298 #else
/* !CONFIG_PROC_FS: no in-use counters, so no slot to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3302
/* !CONFIG_PROC_FS: no in-use counters, so no slot to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3306
/* !CONFIG_PROC_FS: sockets in use are not counted. */
static void sock_inuse_add(struct net *net, int val)
{
}
3310 #endif
3311
3312 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3313 {
3314         if (!rsk_prot)
3315                 return;
3316         kfree(rsk_prot->slab_name);
3317         rsk_prot->slab_name = NULL;
3318         kmem_cache_destroy(rsk_prot->slab);
3319         rsk_prot->slab = NULL;
3320 }
3321
3322 static int req_prot_init(const struct proto *prot)
3323 {
3324         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3325
3326         if (!rsk_prot)
3327                 return 0;
3328
3329         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3330                                         prot->name);
3331         if (!rsk_prot->slab_name)
3332                 return -ENOMEM;
3333
3334         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3335                                            rsk_prot->obj_size, 0,
3336                                            SLAB_ACCOUNT | prot->slab_flags,
3337                                            NULL);
3338
3339         if (!rsk_prot->slab) {
3340                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3341                         prot->name);
3342                 return -ENOMEM;
3343         }
3344         return 0;
3345 }
3346
3347 int proto_register(struct proto *prot, int alloc_slab)
3348 {
3349         if (alloc_slab) {
3350                 prot->slab = kmem_cache_create_usercopy(prot->name,
3351                                         prot->obj_size, 0,
3352                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3353                                         prot->slab_flags,
3354                                         prot->useroffset, prot->usersize,
3355                                         NULL);
3356
3357                 if (prot->slab == NULL) {
3358                         pr_crit("%s: Can't create sock SLAB cache!\n",
3359                                 prot->name);
3360                         goto out;
3361                 }
3362
3363                 if (req_prot_init(prot))
3364                         goto out_free_request_sock_slab;
3365
3366                 if (prot->twsk_prot != NULL) {
3367                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3368
3369                         if (prot->twsk_prot->twsk_slab_name == NULL)
3370                                 goto out_free_request_sock_slab;
3371
3372                         prot->twsk_prot->twsk_slab =
3373                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3374                                                   prot->twsk_prot->twsk_obj_size,
3375                                                   0,
3376                                                   SLAB_ACCOUNT |
3377                                                   prot->slab_flags,
3378                                                   NULL);
3379                         if (prot->twsk_prot->twsk_slab == NULL)
3380                                 goto out_free_timewait_sock_slab_name;
3381                 }
3382         }
3383
3384         mutex_lock(&proto_list_mutex);
3385         list_add(&prot->node, &proto_list);
3386         assign_proto_idx(prot);
3387         mutex_unlock(&proto_list_mutex);
3388         return 0;
3389
3390 out_free_timewait_sock_slab_name:
3391         kfree(prot->twsk_prot->twsk_slab_name);
3392 out_free_request_sock_slab:
3393         req_prot_cleanup(prot->rsk_prot);
3394
3395         kmem_cache_destroy(prot->slab);
3396         prot->slab = NULL;
3397 out:
3398         return -ENOBUFS;
3399 }
3400 EXPORT_SYMBOL(proto_register);
3401
3402 void proto_unregister(struct proto *prot)
3403 {
3404         mutex_lock(&proto_list_mutex);
3405         release_proto_idx(prot);
3406         list_del(&prot->node);
3407         mutex_unlock(&proto_list_mutex);
3408
3409         kmem_cache_destroy(prot->slab);
3410         prot->slab = NULL;
3411
3412         req_prot_cleanup(prot->rsk_prot);
3413
3414         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3415                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3416                 kfree(prot->twsk_prot->twsk_slab_name);
3417                 prot->twsk_prot->twsk_slab = NULL;
3418         }
3419 }
3420 EXPORT_SYMBOL(proto_unregister);
3421
3422 int sock_load_diag_module(int family, int protocol)
3423 {
3424         if (!protocol) {
3425                 if (!sock_is_registered(family))
3426                         return -ENOENT;
3427
3428                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3429                                       NETLINK_SOCK_DIAG, family);
3430         }
3431
3432 #ifdef CONFIG_INET
3433         if (family == AF_INET &&
3434             protocol != IPPROTO_RAW &&
3435             !rcu_access_pointer(inet_protos[protocol]))
3436                 return -ENOENT;
3437 #endif
3438
3439         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3440                               NETLINK_SOCK_DIAG, family, protocol);
3441 }
3442 EXPORT_SYMBOL(sock_load_diag_module);
3443
3444 #ifdef CONFIG_PROC_FS
3445 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3446         __acquires(proto_list_mutex)
3447 {
3448         mutex_lock(&proto_list_mutex);
3449         return seq_list_start_head(&proto_list, *pos);
3450 }
3451
3452 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3453 {
3454         return seq_list_next(v, &proto_list, pos);
3455 }
3456
3457 static void proto_seq_stop(struct seq_file *seq, void *v)
3458         __releases(proto_list_mutex)
3459 {
3460         mutex_unlock(&proto_list_mutex);
3461 }
3462
/* Map a method pointer to a flag character: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
	if (!method)
		return 'n';

	return 'y';
}
3467 static long sock_prot_memory_allocated(struct proto *proto)
3468 {
3469         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3470 }
3471
3472 static char *sock_prot_memory_pressure(struct proto *proto)
3473 {
3474         return proto->memory_pressure != NULL ?
3475         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3476 }
3477
3478 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3479 {
3480
3481         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3482                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3483                    proto->name,
3484                    proto->obj_size,
3485                    sock_prot_inuse_get(seq_file_net(seq), proto),
3486                    sock_prot_memory_allocated(proto),
3487                    sock_prot_memory_pressure(proto),
3488                    proto->max_header,
3489                    proto->slab == NULL ? "no" : "yes",
3490                    module_name(proto->owner),
3491                    proto_method_implemented(proto->close),
3492                    proto_method_implemented(proto->connect),
3493                    proto_method_implemented(proto->disconnect),
3494                    proto_method_implemented(proto->accept),
3495                    proto_method_implemented(proto->ioctl),
3496                    proto_method_implemented(proto->init),
3497                    proto_method_implemented(proto->destroy),
3498                    proto_method_implemented(proto->shutdown),
3499                    proto_method_implemented(proto->setsockopt),
3500                    proto_method_implemented(proto->getsockopt),
3501                    proto_method_implemented(proto->sendmsg),
3502                    proto_method_implemented(proto->recvmsg),
3503                    proto_method_implemented(proto->sendpage),
3504                    proto_method_implemented(proto->bind),
3505                    proto_method_implemented(proto->backlog_rcv),
3506                    proto_method_implemented(proto->hash),
3507                    proto_method_implemented(proto->unhash),
3508                    proto_method_implemented(proto->get_port),
3509                    proto_method_implemented(proto->enter_memory_pressure));
3510 }
3511
3512 static int proto_seq_show(struct seq_file *seq, void *v)
3513 {
3514         if (v == &proto_list)
3515                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3516                            "protocol",
3517                            "size",
3518                            "sockets",
3519                            "memory",
3520                            "press",
3521                            "maxhdr",
3522                            "slab",
3523                            "module",
3524                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3525         else
3526                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3527         return 0;
3528 }
3529
3530 static const struct seq_operations proto_seq_ops = {
3531         .start  = proto_seq_start,
3532         .next   = proto_seq_next,
3533         .stop   = proto_seq_stop,
3534         .show   = proto_seq_show,
3535 };
3536
3537 static __net_init int proto_init_net(struct net *net)
3538 {
3539         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3540                         sizeof(struct seq_net_private)))
3541                 return -ENOMEM;
3542
3543         return 0;
3544 }
3545
3546 static __net_exit void proto_exit_net(struct net *net)
3547 {
3548         remove_proc_entry("protocols", net->proc_net);
3549 }
3550
3551
3552 static __net_initdata struct pernet_operations proto_net_ops = {
3553         .init = proto_init_net,
3554         .exit = proto_exit_net,
3555 };
3556
3557 static int __init proto_init(void)
3558 {
3559         return register_pernet_subsys(&proto_net_ops);
3560 }
3561
3562 subsys_initcall(proto_init);
3563
3564 #endif /* PROC_FS */
3565
3566 #ifdef CONFIG_NET_RX_BUSY_POLL
3567 bool sk_busy_loop_end(void *p, unsigned long start_time)
3568 {
3569         struct sock *sk = p;
3570
3571         return !skb_queue_empty(&sk->sk_receive_queue) ||
3572                sk_busy_loop_timeout(sk, start_time);
3573 }
3574 EXPORT_SYMBOL(sk_busy_loop_end);
3575 #endif /* CONFIG_NET_RX_BUSY_POLL */