fed3f1c6616708997f621535efe9412e4afa0a50
[muen/linux.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
/* Decide whether the TIME_WAIT socket @sktw occupying our desired
 * 4-tuple may be displaced so that @sk can reuse the port pair.
 *
 * @sk:   the connecting socket
 * @sktw: the conflicting TIME_WAIT socket
 * @twp:  opaque cookie from the bind-conflict path; NULL means the
 *        caller is committed and only timestamp state matters
 *
 * Returns 1 if reuse is allowed (a reference on @sktw is taken via
 * sock_hold() for the caller to release), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	/* sysctl_tcp_tw_reuse: 0 = never, 1 = always (when safe),
	 * 2 = only for loopback traffic.
	 */
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			/* s6_addr[12] is the first byte of the embedded
			 * IPv4 address in a v4-mapped v6 address; 127
			 * means 127.0.0.0/8 loopback.
			 */
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start beyond anything the old connection could have sent:
		 * last snd_nxt plus a maximal window (65535) plus slack.
		 */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
171
/* Hook run before tcp_v4_connect(): gives an attached cgroup BPF
 * program a chance to inspect/rewrite the IPv4 connect() target
 * address. Returns 0 or a negative errno from the BPF program.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Lockdep assertion: caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
186
187 /* This will initiate an outgoing connection. */
/* This will initiate an outgoing connection.
 *
 * Resolves the route to @uaddr, binds a local port (entering the
 * socket into the hash tables in SYN-SENT state), picks the initial
 * sequence number and timestamp offset, and finally sends the SYN via
 * tcp_connect() — unless Fast Open defers that until sendmsg().
 * Returns 0 on success or a negative errno; on failure the socket is
 * moved back to TCP_CLOSE and unhashed.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With source routing, the first hop differs from the final
	 * destination: route towards the first router in the option.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Adopt the source address the routing decision selected. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-validate the route now that the (possibly autoselected)
	 * source port is known.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	/* Seed the IP ID counter from connection-specific state. */
	inet->inet_id = tp->write_seq ^ jiffies;

	/* TCP Fast Open may defer the actual SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
320
321 /*
322  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
323  * It can be called through tcp_release_cb() if socket was owned by user
324  * at the time tcp_v4_err() was called to handle ICMP message.
325  */
/* Apply an ICMP_FRAG_NEEDED (RFC 1191) indication whose new MTU was
 * stashed in tp->mtu_info by tcp_v4_err(). May run directly from the
 * ICMP handler or deferred via tcp_release_cb() when the socket was
 * owned by user context at ICMP time.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	/* Listening/closed sockets carry no per-path MTU state. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
361
362 static void do_redirect(struct sk_buff *skb, struct sock *sk)
363 {
364         struct dst_entry *dst = __sk_dst_check(sk, 0);
365
366         if (dst)
367                 dst->ops->redirect(dst, sk, skb);
368 }
369
370
371 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
372 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
373 {
374         struct request_sock *req = inet_reqsk(sk);
375         struct net *net = sock_net(sk);
376
377         /* ICMPs are not backlogged, hence we cannot get
378          * an established socket here.
379          */
380         if (seq != tcp_rsk(req)->snt_isn) {
381                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
382         } else if (abort) {
383                 /*
384                  * Still in SYN_RECV, just remove it silently.
385                  * There is no good way to pass the error to the newly
386                  * created socket, and POSIX does not want network
387                  * errors returned from accept().
388                  */
389                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
390                 tcp_listendrop(req->rsk_listener);
391         }
392         reqsk_put(req);
393 }
394 EXPORT_SYMBOL(tcp_req_err);
395
396 /*
397  * This routine is called by the ICMP module when it gets some
398  * sort of error condition.  If err < 0 then the socket should
399  * be closed and the error returned to the user.  If err > 0
400  * it's just the icmp type << 8 | icmp code.  After adjustment
401  * header points to the first 8 bytes of the tcp header.  We need
402  * to find the appropriate port.
403  *
404  * The locking strategy used here is very "optimistic". When
405  * someone else accesses the socket the ICMP is just dropped
406  * and for some paths there is no check at all.
407  * A more general error queue to queue errors for later handling
408  * is probably better.
409  *
410  */
411
/* ICMP error handler for IPv4 TCP.
 *
 * @icmp_skb: the received ICMP packet; its payload quotes the original
 *            IP header plus at least 8 bytes of the TCP header.
 * @info:     type-specific datum (new MTU for ICMP_FRAG_NEEDED).
 *
 * Looks up the socket the quoted segment belonged to, validates the
 * quoted sequence number against the send window, then either adjusts
 * PMTU state, reverts RTO backoff, or reports a hard/soft error on the
 * socket, depending on ICMP type/code and socket state.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* Nothing to report on TIME_WAIT; just drop our ref. */
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		/* Request sock: abort only on errors that are plausibly
		 * fatal for the handshake (tcp_req_err drops our ref).
		 */
		return tcp_req_err(sk, seq,
				  type == ICMP_PARAMETERPROB ||
				  type == ICMP_TIME_EXCEEDED ||
				  (type == ICMP_DEST_UNREACH &&
				   (code == ICMP_NET_UNREACH ||
				    code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Minimum-TTL (generalized TTL security, RFC 5082) filter. */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Quoted seq must fall inside the outstanding send window,
	 * else the ICMP is stale or forged.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); the extra
				 * sock_hold() keeps sk alive until then.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		/* Undo one step of exponential backoff and recompute the
		 * RTO from current SRTT state.
		 */
		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
603
604 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
605 {
606         struct tcphdr *th = tcp_hdr(skb);
607
608         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
609         skb->csum_start = skb_transport_header(skb) - skb->head;
610         skb->csum_offset = offsetof(struct tcphdr, check);
611 }
612
613 /* This routine computes an IPv4 TCP checksum. */
614 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
615 {
616         const struct inet_sock *inet = inet_sk(sk);
617
618         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
619 }
620 EXPORT_SYMBOL(tcp_v4_send_check);
621
622 /*
623  *      This routine will send an RST to the other tcp.
624  *
625  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
626  *                    for reset.
627  *      Answer: if a packet caused RST, it is not for a socket
628  *              existing in our system, if it is matched to a socket,
629  *              it is just duplicate segment or bug in other side's TCP.
630  *              So that we build reply only basing on parameters
631  *              arrived with segment.
632  *      Exception: precedence violation. We do not implement it in any case.
633  */
634
/* Build and transmit a RST in reply to @skb.
 *
 * @sk may be NULL (segment matched no socket), a full socket, a
 * timewait socket, or a request socket; only header fields of the
 * offending segment are used to build the reply. With TCP-MD5 enabled
 * the RST is signed when a key for the peer is known, and suppressed
 * when the incoming segment carried an MD5 option we cannot validate.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: bare TCP header plus (optionally) an MD5 option. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK to mirror: acknowledge everything the offending
		 * segment consumed (SYN/FIN each count as one).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		/* Verify the incoming segment's MD5 signature; never
		 * reset a flow whose signature we cannot reproduce.
		 */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		/* Append an MD5 option (NOP,NOP,MD5SIG,len + digest). */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Pseudo-header partial checksum; ip_send_unicast_reply()
	 * finishes it using arg.csumoffset.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk may really be a timewait sock; the dev_if fields must alias. */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Transmit via this CPU's control socket (per-netns, per-cpu). */
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
775
/* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
 * outside of socket context, is admittedly ugly; there is currently no
 * cleaner alternative.
 */
779
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: a bare TCP header plus room for the aligned
	 * timestamp option and, when compiled in, the MD5 option.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Caller supplied timestamps: emit NOP,NOP,TIMESTAMP. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the timestamp block when present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		/* Sign with addresses swapped relative to the incoming skb. */
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	/* Seed the pseudo-header checksum; csumoffset tells the IP layer
	 * where the TCP checksum field sits (in 16-bit words).
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Send via this CPU's control socket; propagate the mark from the
	 * (possibly timewait) socket so routing/filters see it.
	 */
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	/* Reset the shared control socket's mark for the next user. */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
861
862 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
863 {
864         struct inet_timewait_sock *tw = inet_twsk(sk);
865         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
866
867         tcp_v4_send_ack(sk, skb,
868                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
869                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
870                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
871                         tcptw->tw_ts_recent,
872                         tw->tw_bound_dev_if,
873                         tcp_twsk_md5_key(tcptw),
874                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
875                         tw->tw_tos
876                         );
877
878         inet_twsk_put(tw);
879 }
880
881 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
882                                   struct request_sock *req)
883 {
884         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
885          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
886          */
887         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
888                                              tcp_sk(sk)->snd_nxt;
889
890         /* RFC 7323 2.3
891          * The window field (SEG.WND) of every outgoing segment, with the
892          * exception of <SYN> segments, MUST be right-shifted by
893          * Rcv.Wind.Shift bits:
894          */
895         tcp_v4_send_ack(sk, skb, seq,
896                         tcp_rsk(req)->rcv_nxt,
897                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
898                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
899                         req->ts_recent,
900                         0,
901                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
902                                           AF_INET),
903                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
904                         ip_hdr(skb)->tos);
905 }
906
907 /*
908  *      Send a SYN-ACK after having received a SYN.
909  *      This still operates on a request_sock only, not on a big
910  *      socket.
911  */
912 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
913                               struct flowi *fl,
914                               struct request_sock *req,
915                               struct tcp_fastopen_cookie *foc,
916                               enum tcp_synack_type synack_type)
917 {
918         const struct inet_request_sock *ireq = inet_rsk(req);
919         struct flowi4 fl4;
920         int err = -1;
921         struct sk_buff *skb;
922
923         /* First, grab a route. */
924         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
925                 return -1;
926
927         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
928
929         if (skb) {
930                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
931
932                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
933                                             ireq->ir_rmt_addr,
934                                             ireq_opt_deref(ireq));
935                 err = net_xmit_eval(err);
936         }
937
938         return err;
939 }
940
941 /*
942  *      IPv4 request_sock destructor.
943  */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	/* Free the saved IP options; "1" asserts to RCU debugging that no
	 * concurrent readers can exist at destruction time.
	 */
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
948
949 #ifdef CONFIG_TCP_MD5SIG
950 /*
951  * RFC2385 MD5 checksumming requires a mapping of
952  * IP address->MD5 Key.
953  * We need to maintain these in the sk structure.
954  */
955
/* Find the best (longest-prefix) key matching an address, or NULL. */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			/* Prefix match: mask both addresses with the key's
			 * prefix length before comparing.
			 */
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		/* Longest prefix wins when several keys match. */
		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
998
/* Like tcp_md5_do_lookup(), but require an exact address + prefixlen
 * match instead of longest-prefix matching.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		/* Compare only the address bytes relevant to the family. */
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1026
1027 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1028                                          const struct sock *addr_sk)
1029 {
1030         const union tcp_md5_addr *addr;
1031
1032         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1033         return tcp_md5_do_lookup(sk, addr, AF_INET);
1034 }
1035 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1036
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * NOTE(review): this in-place update is visible to RCU
		 * readers; a concurrent hasher could observe a torn
		 * key/keylen pair — confirm whether ordering/barriers are
		 * needed here.
		 */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		/* First key on this socket: allocate the list head and
		 * disable GSO (segments must be signed individually).
		 */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	/* Charge the key to the socket's optmem accounting. */
	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	/* Publish: RCU readers may see the key from this point on. */
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
1086
1087 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1088                    u8 prefixlen)
1089 {
1090         struct tcp_md5sig_key *key;
1091
1092         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1093         if (!key)
1094                 return -ENOENT;
1095         hlist_del_rcu(&key->node);
1096         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1097         kfree_rcu(key, rcu);
1098         return 0;
1099 }
1100 EXPORT_SYMBOL(tcp_md5_do_del);
1101
/* Remove and free every MD5 key on the socket (socket teardown path). */
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	/* "1" : no concurrent readers are possible at this point. */
	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		/* Return the memory charge taken in tcp_md5_do_add(). */
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
1117
/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler for IPv4: validate the
 * user-supplied struct tcp_md5sig and add/update/delete the key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;	/* default: host match */

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Prefix length is only honoured for the extended sockopt. */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	/* Zero key length means "delete this key". */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
1152
/* Feed the RFC 2385 pseudo-header plus the TCP header (with checksum
 * zeroed) into the running MD5 hash.  Returns nonzero on crypto failure.
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	/* Build the pseudo-header in the per-cpu scratch area. */
	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* Copy the TCP header right after it, with check = 0 as the
	 * signature is computed over a zeroed checksum field.
	 */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
1177
/* Compute the TCP-MD5 signature over headers only (used for RST/ACK
 * replies built on the stack, which carry no payload).
 * Returns 0 on success; on failure zeroes md5_hash and returns 1.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	/* th->doff << 2 : header length in bytes for the pseudo-header. */
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
1208
/* Compute the TCP-MD5 signature over an entire skb (headers + payload).
 * When sk is NULL the addresses come from the packet's IP header.
 * Returns 0 on success; on failure zeroes md5_hash and returns 1.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	/* Hash pseudo-header + TCP header, then payload, then the key. */
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1255
1256 #endif
1257
/* Called with rcu_read_lock().
 * Returns true when the segment must be dropped (MD5 policy violation),
 * false when it may proceed.
 */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	/* genhash != 0 means the hash computation itself failed. */
	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}
1316
/* IPv4-specific request_sock initialization: record the 4-tuple addresses
 * from the incoming SYN and stash its IP options for the SYN-ACK.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	/* Our local address is the packet's destination, and vice versa. */
	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
1328
/* af_ops route_req hook: resolve an IPv4 route for the request socket. */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1335
/* Generic request_sock operations for IPv4 TCP (SYN-RECV handling). */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1345
/* IPv4-specific request_sock hooks used by the generic tcp_conn_request(). */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1361
1362 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1363 {
1364         /* Never answer to SYNs send to broadcast or multicast */
1365         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1366                 goto drop;
1367
1368         return tcp_conn_request(&tcp_request_sock_ops,
1369                                 &tcp_request_sock_ipv4_ops, sk, skb);
1370
1371 drop:
1372         tcp_listendrop(sk);
1373         return 0;
1374 }
1375 EXPORT_SYMBOL(tcp_v4_conn_request);
1376
1377
1378 /*
1379  * The three way handshake has completed - we got a valid synack -
1380  * now create the new socket.
1381  */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Populate the child's IPv4 identity from the request socket and
	 * the final ACK's IP header.
	 */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	/* Hash the child; *own_req tells the caller whether our child
	 * won the (possible) race to replace req in the ehash table.
	 */
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* Options ownership moved to newsk above. */
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Child was created but cannot be used: tear it down safely. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1482
/* When syncookies are enabled, a bare ACK arriving at a listener may be
 * the completion of a cookie handshake: validate it and return the child
 * socket, the unchanged listener, or NULL (invalid cookie).
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	/* SYNs never carry cookies; only check non-SYN segments. */
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1493
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx route if the ingress device
			 * changed or the route is no longer valid.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		/* May return the listener itself, a cookie-validated
		 * child, or NULL (bad cookie -> discard).
		 */
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1563
/* Early demux: before routing, try to find an established socket for this
 * packet and attach it (plus its cached rx route) to the skb, so the
 * normal lookup and route resolution can be skipped later.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Malformed: data offset smaller than the minimal TCP header. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* Only reuse the cached route if it is still valid
			 * and was learned on the same ingress device.
			 */
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1601
1602 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1603 {
1604         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1605
1606         /* Only socket owner can try to collapse/prune rx queues
1607          * to reduce memory overhead, so add a little headroom here.
1608          * Few sockets backlog are possibly concurrently non empty.
1609          */
1610         limit += 64*1024;
1611
1612         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1613          * we can fix skb->truesize to its real value to avoid future drops.
1614          * This is valid because skb is not yet charged to the socket.
1615          * It has been noticed pure SACK packets were sometimes dropped
1616          * (if cooked by drivers without copybreak feature).
1617          */
1618         skb_condense(skb);
1619
1620         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1621                 bh_unlock_sock(sk);
1622                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1623                 return true;
1624         }
1625         return false;
1626 }
1627 EXPORT_SYMBOL(tcp_add_backlog);
1628
1629 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1630 {
1631         struct tcphdr *th = (struct tcphdr *)skb->data;
1632         unsigned int eaten = skb->len;
1633         int err;
1634
1635         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1636         if (!err) {
1637                 eaten -= skb->len;
1638                 TCP_SKB_CB(skb)->end_seq -= eaten;
1639         }
1640         return err;
1641 }
1642 EXPORT_SYMBOL(tcp_filter);
1643
1644 static void tcp_v4_restore_cb(struct sk_buff *skb)
1645 {
1646         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1647                 sizeof(struct inet_skb_parm));
1648 }
1649
/* Populate TCP_SKB_CB() from the TCP/IP headers.  Must run before any
 * TCP state machine code looks at the skb control block.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq counts SYN and FIN as one sequence unit each. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	/* Remember whether a software or hardware rx timestamp is present. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1671
1672 /*
1673  *      From tcp_input.c
1674  */
1675
/* Main IPv4 TCP receive entry point, called from the IP layer for every
 * TCP segment.  Validates the header, looks up the owning socket and
 * dispatches to the proper state handler (TIME_WAIT, NEW_SYN_RECV,
 * listener or established processing).  Returns 0 in almost all cases;
 * the skb is always consumed.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	/* Basic header sanity: fixed header present, doff plausible,
	 * full header (with options) pullable into the linear area.
	 */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload pointers: pskb_may_pull() may have reallocated the head. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	/* Incoming segment matched a pending connection request. */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		/* Listener went away (e.g. closed); drop the req and retry
		 * the lookup from scratch.
		 */
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* tcp_filter() may have reallocated the head. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Plain listener: fall through to normal processing. */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	/* Reload after possible head reallocation by tcp_filter(). */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	/* Established path: process directly if we can take ownership,
	 * otherwise queue to the backlog for the socket owner.
	 */
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		/* tcp_add_backlog() already unlocked the socket. */
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment for a nonexistent connection: send RST. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* Acceptable new SYN on a TIME_WAIT 4-tuple: find a listener
		 * willing to take it, retire the tw sock and reprocess.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			/* Listener lookup is lockless/RCU: no ref to drop. */
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1886
/* TCP glue for the generic inet TIME_WAIT machinery: object size plus
 * uniqueness-check and destructor callbacks.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
1892
1893 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1894 {
1895         struct dst_entry *dst = skb_dst(skb);
1896
1897         if (dst && dst_hold_safe(dst)) {
1898                 sk->sk_rx_dst = dst;
1899                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1900         }
1901 }
1902 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1903
/* Address-family specific connection ops for TCP over IPv4; installed
 * as icsk_af_ops in tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
1923
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 callbacks for RFC 2385 TCP-MD5 signature handling. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif
1931
1932 /* NOTE: A lot of things set to zero explicitly by call to
1933  *       sk_alloc() so need not be done here.
1934  */
/* Per-socket init for AF_INET TCP sockets (proto .init callback):
 * generic TCP setup plus installation of the IPv4 af_ops.
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
1949
/* Final teardown of a TCP socket (proto .destroy callback): stop timers,
 * release congestion control / ULP state, purge queues and drop the bind
 * bucket.  The order of the cleanup calls below is deliberate.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A live fastopen request sock here would be a refcounting bug. */
	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
1994
1995 #ifdef CONFIG_PROC_FS
1996 /* Proc filesystem TCP sock list dumping. */
1997
1998 /*
1999  * Get next listener socket follow cur.  If cur is NULL, get first socket
2000  * starting from bucket given in st->bucket; when st->bucket is zero the
2001  * very first socket in the hash table is returned.
2002  */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		/* Start of a bucket: take its lock.  The lock stays held
		 * when we return a socket; tcp_seq_stop() releases it.
		 */
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	/* Skip sockets from other netns or address families. */
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	/* Bucket exhausted: drop its lock and move to the next one. */
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
2037
2038 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2039 {
2040         struct tcp_iter_state *st = seq->private;
2041         void *rc;
2042
2043         st->bucket = 0;
2044         st->offset = 0;
2045         rc = listening_get_next(seq, NULL);
2046
2047         while (rc && *pos) {
2048                 rc = listening_get_next(seq, rc);
2049                 --*pos;
2050         }
2051         return rc;
2052 }
2053
/* True if the established-hash bucket selected by st->bucket is empty. */
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
2058
2059 /*
2060  * Get first established socket starting from bucket given in st->bucket.
2061  * If st->bucket is zero, the very first socket in the hash is returned.
2062  */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		/* The bucket lock remains held when we return a socket;
		 * established_get_next()/tcp_seq_stop() releases it.
		 */
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			/* Skip other families and other netns. */
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
2094
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	/* Continue within the current (still locked) bucket. */
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	/* Current bucket exhausted: release its lock and advance. */
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2118
2119 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122         void *rc;
2123
2124         st->bucket = 0;
2125         rc = established_get_first(seq);
2126
2127         while (rc && pos) {
2128                 rc = established_get_next(seq, rc);
2129                 --pos;
2130         }
2131         return rc;
2132 }
2133
2134 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2135 {
2136         void *rc;
2137         struct tcp_iter_state *st = seq->private;
2138
2139         st->state = TCP_SEQ_STATE_LISTENING;
2140         rc        = listening_get_idx(seq, &pos);
2141
2142         if (!rc) {
2143                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2144                 rc        = established_get_idx(seq, pos);
2145         }
2146
2147         return rc;
2148 }
2149
/* Fast-resume helper: re-seek to the position recorded in st->bucket /
 * st->offset instead of rewalking the whole table, preserving st->num.
 * Returns the socket at that position, or NULL if it vanished.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		/* Re-skip the sockets we had already emitted in this bucket. */
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table ran dry: continue in the established hash. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	/* The helpers above bumped st->num; restore the saved count. */
	st->num = orig_num;

	return rc;
}
2182
/* seq_file .start callback for /proc/net/tcp{,6}.  Tries the cheap
 * resume path when *pos matches the last emitted position, otherwise
 * restarts the walk from scratch.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	/* Full restart: reset iterator state and walk up to *pos - 1.
	 * *pos == 0 yields the header token instead of a socket.
	 */
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
2205
/* seq_file .next callback: advance to the following socket, moving from
 * the listening table into the established hash when the former is done.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	/* After the header token, the first real entry is index 0. */
	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listening table exhausted: switch tables. */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
2236
/* seq_file .stop callback: drop whichever bucket lock the get_next
 * helpers left held for the entry currently being shown.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		/* No lock is held while showing the header token. */
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
2253
/* Emit one /proc/net/tcp line for a pending connection request
 * (SYN_RECV).  The format must stay byte-compatible with userspace
 * parsers of /proc/net/tcp.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* May be negative if the SYN-ACK timer already expired. */
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2279
/* Emit one /proc/net/tcp line for a full socket.  Reads the socket
 * without locking it, so some fields may be transiently inconsistent.
 * The output format must stay byte-compatible with userspace parsers.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* Classic /proc timer codes: 1 = retransmit/loss-probe timer,
	 * 4 = zero-window probe, 2 = sk_timer (keepalive), 0 = none.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		/* For listeners, report the accept queue length instead. */
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2340
/* Emit one /proc/net/tcp line for a TIME_WAIT minisocket.  Most fields
 * are fixed placeholders since a tw sock carries no queues or timers
 * beyond its expiry timer.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	/* Remaining TIME_WAIT lifetime; may be negative if just expired. */
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2359
2360 #define TMPSZ 150
2361
2362 static int tcp4_seq_show(struct seq_file *seq, void *v)
2363 {
2364         struct tcp_iter_state *st;
2365         struct sock *sk = v;
2366
2367         seq_setwidth(seq, TMPSZ - 1);
2368         if (v == SEQ_START_TOKEN) {
2369                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2370                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2371                            "inode");
2372                 goto out;
2373         }
2374         st = seq->private;
2375
2376         if (sk->sk_state == TCP_TIME_WAIT)
2377                 get_timewait4_sock(v, seq, st->num);
2378         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2379                 get_openreq4(v, seq, st->num);
2380         else
2381                 get_tcp4_sock(v, seq, st->num);
2382 out:
2383         seq_pad(seq, '\n');
2384         return 0;
2385 }
2386
/* seq_file operations backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
2393
/* Restricts the shared TCP iterator to AF_INET sockets. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
2397
2398 static int __net_init tcp4_proc_init_net(struct net *net)
2399 {
2400         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2401                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2402                 return -ENOMEM;
2403         return 0;
2404 }
2405
/* Remove /proc/net/tcp when a network namespace goes away. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
2410
/* Per-netns lifecycle hooks for the /proc/net/tcp entry. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
2415
/* Register the per-netns /proc/net/tcp machinery at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
2420
/* Tear down the per-netns /proc/net/tcp machinery. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2425 #endif /* CONFIG_PROC_FS */
2426
/* The AF_INET TCP protocol descriptor: wires the TCP implementation
 * into the generic socket layer (SOCK_STREAM/IPPROTO_TCP).
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
2474
2475 static void __net_exit tcp_sk_exit(struct net *net)
2476 {
2477         int cpu;
2478
2479         module_put(net->ipv4.tcp_congestion_control->owner);
2480
2481         for_each_possible_cpu(cpu)
2482                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2483         free_percpu(net->ipv4.tcp_sk);
2484 }
2485
/* Per-netns TCP initialisation.
 *
 * Creates one raw IPPROTO_TCP control socket per possible CPU (kept in
 * the net->ipv4.tcp_sk percpu array) and seeds every per-netns TCP
 * sysctl with its default value.
 *
 * Returns 0 on success or a negative errno; on control-socket creation
 * failure the partially built state is unwound via tcp_sk_exit().
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* Path MTU probing defaults */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	/* Keepalive defaults */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Retransmission / connection-establishment defaults */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	/* Allow at most half of the established-hash entries to be
	 * TIME-WAIT buckets.
	 */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	/* Scale the SYN backlog with the hash size, but never below 128 */
	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child namespaces inherit the (possibly tuned) rmem/wmem limits
	 * of init_net rather than the compiled-in defaults.
	 */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
2588
/* Batched per-netns exit: flush all IPv4 TIME-WAIT sockets once for the
 * whole batch, then destroy each exiting namespace's TCP Fast Open
 * context.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
2598
/* Wires per-netns TCP setup/teardown into namespace lifetime. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init       = tcp_sk_init,
       .exit       = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
2604
/* Boot-time IPv4 TCP initialisation: registration cannot fail softly —
 * without the per-CPU control sockets TCP cannot operate, so panic.
 */
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}