net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
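
/*
 * Illustration only, not part of the original file: ip_fast_csum() above
 * is an arch-optimized primitive. The hypothetical helper below is a
 * minimal portable sketch of the same RFC 1071 one's-complement header
 * checksum. One's-complement sums are byte-order independent, so summing
 * native 16-bit loads is safe; ihl counts 32-bit words.
 */
static inline u16 example_ip_hdr_csum(const void *hdr, unsigned int ihl)
{
        const u16 *p = hdr;
        u32 sum = 0;
        unsigned int i;

        for (i = 0; i < ihl * 2; i++)   /* ihl words = 2*ihl 16-bit chunks */
                sum += p[i];
        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (u16)~sum;
}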

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}
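
/*
 * Illustration only: inet->uc_ttl above reflects the IP_TTL socket
 * option; when it is unset (negative), the route's hop-limit metric is
 * used instead. A userspace sketch of overriding the default TTL:
 *
 *      int ttl = 8;
 *      setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 */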

/*
 *              Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb->dst->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable *)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users.
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host. */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}
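
/*
 * Illustration only: the mc_loop test above corresponds to the
 * IP_MULTICAST_LOOP socket option. A userspace sketch of disabling
 * loopback of one's own multicast transmissions:
 *
 *      unsigned char loop = 0;
 *      setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
 */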

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_queue_xmit(struct sk_buff *skb, struct sock *sk, int ipfragok)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *)skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the retransmission mechanism of the
                         * transport layer will keep trying until the route
                         * appears or the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
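        /* One 16-bit store fills in version (4), ihl (5 words, options are
         * added below) and TOS: the top nibble is the version, the next
         * nibble the header length, and the low byte the TOS field.
         */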
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        /* Connection association is the same as for the pre-fragment packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each equal to the IP header plus a block of the
 *      data of the original IP datagram) that will still fit in a single
 *      device frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = (struct rtable *)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_mtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In such a case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes out. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* for bridged IP traffic encapsulated inside e.g. a VLAN header,
         * we need to make room for the encapsulating header
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, skb->data, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that's not
                 *                the last fragment then keep the MF bit set
                 *                on each piece.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

EXPORT_SYMBOL(ip_fragment);
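
/*
 * Illustration only, not part of the original file: the frag_off field
 * packs three flag bits (the reserved bit, IP_DF and IP_MF) with a
 * 13-bit offset expressed in 8-byte units, which is why ip_fragment()
 * shifts byte offsets right by 3 and masks with IP_OFFSET. A
 * hypothetical encoder:
 */
static inline __be16 example_encode_frag_off(unsigned int byte_off, int more)
{
        u16 v = (byte_off >> 3) & IP_OFFSET;    /* offset in 8-byte units */

        if (more)
                v |= IP_MF;                     /* more fragments follow */
        return htons(v);
}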

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * build one single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb->nh.raw = skb->data;

                /* initialize the protocol header pointer */
                skb->h.raw = skb->data + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                               (length - transhdrlen));
        if (!err) {
                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* Appending the data failed, so UFO cannot be used;
         * follow the normal path.
         */
        kfree_skb(skb);
        return err;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP; other transport protocols, e.g. raw sockets, can
 *      potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
                        (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of
         * skbs; each segment is an IP fragment ready for sending to the
         * network once an appropriate IP header has been added.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into the current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at the
                         * tail. Note, with MSG_MORE we overallocate on
                         * fragments, because we have no idea which fragment
                         * will be the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}
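
/*
 * Illustration only: the corking path above is what a userspace UDP
 * sender drives via the UDP_CORK socket option, accumulating several
 * writes into one datagram until the cork is removed:
 *
 *      int on = 1, off = 0;
 *      setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
 *      send(fd, part1, len1, 0);       (appended, not yet sent)
 *      send(fd, part2, len2, 0);
 *      setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
 */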

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {

                        /* Check if the remaining data fits into the current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        char *data;
                        struct iphdr *iph;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fragheaderlen + fraggap);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to the IP header from the ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
         * allow fragmenting the frame generated here. No matter how
         * transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, unfragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}
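
/*
 * Illustration only: inet->pmtudisc above is controlled from userspace
 * via the IP_MTU_DISCOVER socket option. A sketch of forcing the DF bit
 * (strict path MTU discovery) on a socket:
 *
 *      int val = IP_PMTUDISC_DO;
 *      setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */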

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}


/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = (struct rtable *)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.
           Note that it relies on the fact that this function is called
           with BH disabled locally and that sk cannot already be
           spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);