Linux-libre 3.10.48-gnu
[librecmc/linux-libre.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
1 /*
2  * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
13  *
14  *              Network name space (netns) aware.
15  *              Global data moved to netns i.e struct netns_ipvs
16  *              tcp_timeouts table has copy per netns in a hash table per
17  *              protocol ip_vs_proto_data and is handled by netns
18  */
19
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23 #include <linux/kernel.h>
24 #include <linux/ip.h>
25 #include <linux/tcp.h>                  /* for tcphdr */
26 #include <net/ip.h>
27 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
28 #include <net/ip6_checksum.h>
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31
32 #include <net/ip_vs.h>
33
34 static int
35 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
36                   int *verdict, struct ip_vs_conn **cpp,
37                   struct ip_vs_iphdr *iph)
38 {
39         struct net *net;
40         struct ip_vs_service *svc;
41         struct tcphdr _tcph, *th;
42
43         th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
44         if (th == NULL) {
45                 *verdict = NF_DROP;
46                 return 0;
47         }
48         net = skb_net(skb);
49         /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
50         rcu_read_lock();
51         if (th->syn &&
52             (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
53                                       &iph->daddr, th->dest))) {
54                 int ignored;
55
56                 if (ip_vs_todrop(net_ipvs(net))) {
57                         /*
58                          * It seems that we are very loaded.
59                          * We have to drop this packet :(
60                          */
61                         rcu_read_unlock();
62                         *verdict = NF_DROP;
63                         return 0;
64                 }
65
66                 /*
67                  * Let the virtual server select a real server for the
68                  * incoming connection, and create a connection entry.
69                  */
70                 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
71                 if (!*cpp && ignored <= 0) {
72                         if (!ignored)
73                                 *verdict = ip_vs_leave(svc, skb, pd, iph);
74                         else
75                                 *verdict = NF_DROP;
76                         rcu_read_unlock();
77                         return 0;
78                 }
79         }
80         rcu_read_unlock();
81         /* NF_ACCEPT */
82         return 1;
83 }
84
85
86 static inline void
87 tcp_fast_csum_update(int af, struct tcphdr *tcph,
88                      const union nf_inet_addr *oldip,
89                      const union nf_inet_addr *newip,
90                      __be16 oldport, __be16 newport)
91 {
92 #ifdef CONFIG_IP_VS_IPV6
93         if (af == AF_INET6)
94                 tcph->check =
95                         csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
96                                          ip_vs_check_diff2(oldport, newport,
97                                                 ~csum_unfold(tcph->check))));
98         else
99 #endif
100         tcph->check =
101                 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
102                                  ip_vs_check_diff2(oldport, newport,
103                                                 ~csum_unfold(tcph->check))));
104 }
105
106
107 static inline void
108 tcp_partial_csum_update(int af, struct tcphdr *tcph,
109                      const union nf_inet_addr *oldip,
110                      const union nf_inet_addr *newip,
111                      __be16 oldlen, __be16 newlen)
112 {
113 #ifdef CONFIG_IP_VS_IPV6
114         if (af == AF_INET6)
115                 tcph->check =
116                         ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
117                                          ip_vs_check_diff2(oldlen, newlen,
118                                                 csum_unfold(tcph->check))));
119         else
120 #endif
121         tcph->check =
122                 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
123                                 ip_vs_check_diff2(oldlen, newlen,
124                                                 csum_unfold(tcph->check))));
125 }
126
127
128 static int
129 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
130                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
131 {
132         struct tcphdr *tcph;
133         unsigned int tcphoff = iph->len;
134         int oldlen;
135         int payload_csum = 0;
136
137 #ifdef CONFIG_IP_VS_IPV6
138         if (cp->af == AF_INET6 && iph->fragoffs)
139                 return 1;
140 #endif
141         oldlen = skb->len - tcphoff;
142
143         /* csum_check requires unshared skb */
144         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
145                 return 0;
146
147         if (unlikely(cp->app != NULL)) {
148                 int ret;
149
150                 /* Some checks before mangling */
151                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
152                         return 0;
153
154                 /* Call application helper if needed */
155                 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
156                         return 0;
157                 /* ret=2: csum update is needed after payload mangling */
158                 if (ret == 1)
159                         oldlen = skb->len - tcphoff;
160                 else
161                         payload_csum = 1;
162         }
163
164         tcph = (void *)skb_network_header(skb) + tcphoff;
165         tcph->source = cp->vport;
166
167         /* Adjust TCP checksums */
168         if (skb->ip_summed == CHECKSUM_PARTIAL) {
169                 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
170                                         htons(oldlen),
171                                         htons(skb->len - tcphoff));
172         } else if (!payload_csum) {
173                 /* Only port and addr are changed, do fast csum update */
174                 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
175                                      cp->dport, cp->vport);
176                 if (skb->ip_summed == CHECKSUM_COMPLETE)
177                         skb->ip_summed = (cp->app && pp->csum_check) ?
178                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
179         } else {
180                 /* full checksum calculation */
181                 tcph->check = 0;
182                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
183 #ifdef CONFIG_IP_VS_IPV6
184                 if (cp->af == AF_INET6)
185                         tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
186                                                       &cp->caddr.in6,
187                                                       skb->len - tcphoff,
188                                                       cp->protocol, skb->csum);
189                 else
190 #endif
191                         tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
192                                                         cp->caddr.ip,
193                                                         skb->len - tcphoff,
194                                                         cp->protocol,
195                                                         skb->csum);
196                 skb->ip_summed = CHECKSUM_UNNECESSARY;
197
198                 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
199                           pp->name, tcph->check,
200                           (char*)&(tcph->check) - (char*)tcph);
201         }
202         return 1;
203 }
204
205
206 static int
207 tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
208                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
209 {
210         struct tcphdr *tcph;
211         unsigned int tcphoff = iph->len;
212         int oldlen;
213         int payload_csum = 0;
214
215 #ifdef CONFIG_IP_VS_IPV6
216         if (cp->af == AF_INET6 && iph->fragoffs)
217                 return 1;
218 #endif
219         oldlen = skb->len - tcphoff;
220
221         /* csum_check requires unshared skb */
222         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
223                 return 0;
224
225         if (unlikely(cp->app != NULL)) {
226                 int ret;
227
228                 /* Some checks before mangling */
229                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
230                         return 0;
231
232                 /*
233                  *      Attempt ip_vs_app call.
234                  *      It will fix ip_vs_conn and iph ack_seq stuff
235                  */
236                 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
237                         return 0;
238                 /* ret=2: csum update is needed after payload mangling */
239                 if (ret == 1)
240                         oldlen = skb->len - tcphoff;
241                 else
242                         payload_csum = 1;
243         }
244
245         tcph = (void *)skb_network_header(skb) + tcphoff;
246         tcph->dest = cp->dport;
247
248         /*
249          *      Adjust TCP checksums
250          */
251         if (skb->ip_summed == CHECKSUM_PARTIAL) {
252                 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
253                                         htons(oldlen),
254                                         htons(skb->len - tcphoff));
255         } else if (!payload_csum) {
256                 /* Only port and addr are changed, do fast csum update */
257                 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
258                                      cp->vport, cp->dport);
259                 if (skb->ip_summed == CHECKSUM_COMPLETE)
260                         skb->ip_summed = (cp->app && pp->csum_check) ?
261                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
262         } else {
263                 /* full checksum calculation */
264                 tcph->check = 0;
265                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
266 #ifdef CONFIG_IP_VS_IPV6
267                 if (cp->af == AF_INET6)
268                         tcph->check = csum_ipv6_magic(&cp->caddr.in6,
269                                                       &cp->daddr.in6,
270                                                       skb->len - tcphoff,
271                                                       cp->protocol, skb->csum);
272                 else
273 #endif
274                         tcph->check = csum_tcpudp_magic(cp->caddr.ip,
275                                                         cp->daddr.ip,
276                                                         skb->len - tcphoff,
277                                                         cp->protocol,
278                                                         skb->csum);
279                 skb->ip_summed = CHECKSUM_UNNECESSARY;
280         }
281         return 1;
282 }
283
284
285 static int
286 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
287 {
288         unsigned int tcphoff;
289
290 #ifdef CONFIG_IP_VS_IPV6
291         if (af == AF_INET6)
292                 tcphoff = sizeof(struct ipv6hdr);
293         else
294 #endif
295                 tcphoff = ip_hdrlen(skb);
296
297         switch (skb->ip_summed) {
298         case CHECKSUM_NONE:
299                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
300         case CHECKSUM_COMPLETE:
301 #ifdef CONFIG_IP_VS_IPV6
302                 if (af == AF_INET6) {
303                         if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
304                                             &ipv6_hdr(skb)->daddr,
305                                             skb->len - tcphoff,
306                                             ipv6_hdr(skb)->nexthdr,
307                                             skb->csum)) {
308                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
309                                                  "Failed checksum for");
310                                 return 0;
311                         }
312                 } else
313 #endif
314                         if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
315                                               ip_hdr(skb)->daddr,
316                                               skb->len - tcphoff,
317                                               ip_hdr(skb)->protocol,
318                                               skb->csum)) {
319                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
320                                                  "Failed checksum for");
321                                 return 0;
322                         }
323                 break;
324         default:
325                 /* No need to checksum. */
326                 break;
327         }
328
329         return 1;
330 }
331
332
/*
 * Row offsets into the TCP state transition tables. Each direction owns
 * four consecutive rows, selected by tcp_state_idx() (syn, fin, ack, rst).
 */
#define TCP_DIR_INPUT           0
#define TCP_DIR_OUTPUT          4
#define TCP_DIR_INPUT_ONLY      8

/* Maps an IP_VS_DIR_* packet direction to its TCP_DIR_* row offset. */
static const int tcp_state_off[IP_VS_DIR_LAST] = {
        [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
        [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
        [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
};
342
/*
 *      Default timeout table[state], expressed as multiples of HZ.
 *
 *      __ip_vs_tcp_init() hands this table to ip_vs_create_timeout_table()
 *      to build the per-netns pd->timeout_table; it is also the fallback
 *      used when no protocol data is available.
 */
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
        [IP_VS_TCP_S_NONE]              =       2*HZ,
        [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
        [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
        [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
        [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
        [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
        [IP_VS_TCP_S_CLOSE]             =       10*HZ,
        [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
        [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
        [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
        [IP_VS_TCP_S_SYNACK]            =       120*HZ,
        [IP_VS_TCP_S_LAST]              =       2*HZ,
};
360
/*
 * Printable name for each IP_VS_TCP_S_* state, looked up by
 * tcp_state_name(). The IP_VS_TCP_S_LAST slot holds a sentinel string.
 */
static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
        [IP_VS_TCP_S_NONE]              =       "NONE",
        [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
        [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
        [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
        [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
        [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
        [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
        [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
        [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
        [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
        [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
        [IP_VS_TCP_S_LAST]              =       "BUG!",
};
375
/* Short aliases for the IP_VS_TCP_S_* states, used to keep the transition
 * tables below readable. */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/* One row of a transition table: next_state[] is indexed by the
 * connection's current state and yields the state to move to. */
struct tcp_states_t {
        int next_state[IP_VS_TCP_S_LAST];
};
391
392 static const char * tcp_state_name(int state)
393 {
394         if (state >= IP_VS_TCP_S_LAST)
395                 return "ERR!";
396         return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
397 }
398
/*
 * Default TCP state transition table.
 *
 * Row    = direction offset (TCP_DIR_*) + event index from tcp_state_idx()
 *          (0 = syn, 1 = fin, 2 = ack, 3 = rst).
 * Column = current connection state, in the order shown in each header.
 * Entry  = state the connection moves to.
 */
static struct tcp_states_t tcp_states [] = {
/*      INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*      OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*      INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
421
/*
 * Alternative TCP state transition table, installed by
 * tcp_timeout_change() when bit 0 of its flags (secure_tcp) is set.
 *
 * Same layout as tcp_states: row = direction offset + event index
 * (syn, fin, ack, rst), column = current state, entry = next state.
 */
static struct tcp_states_t tcp_states_dos [] = {
/*      INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*      OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*      INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
444
445 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
446 {
447         int on = (flags & 1);           /* secure_tcp */
448
449         /*
450         ** FIXME: change secure_tcp to independent sysctl var
451         ** or make it per-service or per-app because it is valid
452         ** for most if not for all of the applications. Something
453         ** like "capabilities" (flags) for each object.
454         */
455         pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
456 }
457
458 static inline int tcp_state_idx(struct tcphdr *th)
459 {
460         if (th->rst)
461                 return 3;
462         if (th->syn)
463                 return 0;
464         if (th->fin)
465                 return 1;
466         if (th->ack)
467                 return 2;
468         return -1;
469 }
470
471 static inline void
472 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
473               int direction, struct tcphdr *th)
474 {
475         int state_idx;
476         int new_state = IP_VS_TCP_S_CLOSE;
477         int state_off = tcp_state_off[direction];
478
479         /*
480          *    Update state offset to INPUT_ONLY if necessary
481          *    or delete NO_OUTPUT flag if output packet detected
482          */
483         if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
484                 if (state_off == TCP_DIR_OUTPUT)
485                         cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
486                 else
487                         state_off = TCP_DIR_INPUT_ONLY;
488         }
489
490         if ((state_idx = tcp_state_idx(th)) < 0) {
491                 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
492                 goto tcp_state_out;
493         }
494
495         new_state =
496                 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
497
498   tcp_state_out:
499         if (new_state != cp->state) {
500                 struct ip_vs_dest *dest = cp->dest;
501
502                 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
503                               "%s:%d state: %s->%s conn->refcnt:%d\n",
504                               pd->pp->name,
505                               ((state_off == TCP_DIR_OUTPUT) ?
506                                "output " : "input "),
507                               th->syn ? 'S' : '.',
508                               th->fin ? 'F' : '.',
509                               th->ack ? 'A' : '.',
510                               th->rst ? 'R' : '.',
511                               IP_VS_DBG_ADDR(cp->af, &cp->daddr),
512                               ntohs(cp->dport),
513                               IP_VS_DBG_ADDR(cp->af, &cp->caddr),
514                               ntohs(cp->cport),
515                               tcp_state_name(cp->state),
516                               tcp_state_name(new_state),
517                               atomic_read(&cp->refcnt));
518
519                 if (dest) {
520                         if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
521                             (new_state != IP_VS_TCP_S_ESTABLISHED)) {
522                                 atomic_dec(&dest->activeconns);
523                                 atomic_inc(&dest->inactconns);
524                                 cp->flags |= IP_VS_CONN_F_INACTIVE;
525                         } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
526                                    (new_state == IP_VS_TCP_S_ESTABLISHED)) {
527                                 atomic_inc(&dest->activeconns);
528                                 atomic_dec(&dest->inactconns);
529                                 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
530                         }
531                 }
532         }
533
534         if (likely(pd))
535                 cp->timeout = pd->timeout_table[cp->state = new_state];
536         else    /* What to do ? */
537                 cp->timeout = tcp_timeouts[cp->state = new_state];
538 }
539
540 /*
541  *      Handle state transitions
542  */
543 static void
544 tcp_state_transition(struct ip_vs_conn *cp, int direction,
545                      const struct sk_buff *skb,
546                      struct ip_vs_proto_data *pd)
547 {
548         struct tcphdr _tcph, *th;
549
550 #ifdef CONFIG_IP_VS_IPV6
551         int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
552 #else
553         int ihl = ip_hdrlen(skb);
554 #endif
555
556         th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
557         if (th == NULL)
558                 return;
559
560         spin_lock_bh(&cp->lock);
561         set_tcp_state(pd, cp, direction, th);
562         spin_unlock_bh(&cp->lock);
563 }
564
565 static inline __u16 tcp_app_hashkey(__be16 port)
566 {
567         return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
568                 & TCP_APP_TAB_MASK;
569 }
570
571
572 static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
573 {
574         struct ip_vs_app *i;
575         __u16 hash;
576         __be16 port = inc->port;
577         int ret = 0;
578         struct netns_ipvs *ipvs = net_ipvs(net);
579         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
580
581         hash = tcp_app_hashkey(port);
582
583         list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
584                 if (i->port == port) {
585                         ret = -EEXIST;
586                         goto out;
587                 }
588         }
589         list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
590         atomic_inc(&pd->appcnt);
591
592   out:
593         return ret;
594 }
595
596
597 static void
598 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
599 {
600         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
601
602         atomic_dec(&pd->appcnt);
603         list_del_rcu(&inc->p_list);
604 }
605
606
607 static int
608 tcp_app_conn_bind(struct ip_vs_conn *cp)
609 {
610         struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
611         int hash;
612         struct ip_vs_app *inc;
613         int result = 0;
614
615         /* Default binding: bind app only for NAT */
616         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
617                 return 0;
618
619         /* Lookup application incarnations and bind the right one */
620         hash = tcp_app_hashkey(cp->vport);
621
622         rcu_read_lock();
623         list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
624                 if (inc->port == cp->vport) {
625                         if (unlikely(!ip_vs_app_inc_get(inc)))
626                                 break;
627                         rcu_read_unlock();
628
629                         IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
630                                       "%s:%u to app %s on port %u\n",
631                                       __func__,
632                                       IP_VS_DBG_ADDR(cp->af, &cp->caddr),
633                                       ntohs(cp->cport),
634                                       IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
635                                       ntohs(cp->vport),
636                                       inc->name, ntohs(inc->port));
637
638                         cp->app = inc;
639                         if (inc->init_conn)
640                                 result = inc->init_conn(inc, cp);
641                         goto out;
642                 }
643         }
644         rcu_read_unlock();
645
646   out:
647         return result;
648 }
649
650
651 /*
652  *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
653  */
654 void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
655 {
656         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
657
658         spin_lock_bh(&cp->lock);
659         cp->state = IP_VS_TCP_S_LISTEN;
660         cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
661                            : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
662         spin_unlock_bh(&cp->lock);
663 }
664
665 /* ---------------------------------------------
666  *   timeouts is netns related now.
667  * ---------------------------------------------
668  */
669 static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
670 {
671         struct netns_ipvs *ipvs = net_ipvs(net);
672
673         ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
674         pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
675                                                         sizeof(tcp_timeouts));
676         if (!pd->timeout_table)
677                 return -ENOMEM;
678         pd->tcp_state_table =  tcp_states;
679         return 0;
680 }
681
/*
 * Per-netns teardown of the TCP protocol data: release the timeout table
 * created by __ip_vs_tcp_init(). The net argument is unused.
 */
static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
        kfree(pd->timeout_table);
}
686
687
688 struct ip_vs_protocol ip_vs_protocol_tcp = {
689         .name =                 "TCP",
690         .protocol =             IPPROTO_TCP,
691         .num_states =           IP_VS_TCP_S_LAST,
692         .dont_defrag =          0,
693         .init =                 NULL,
694         .exit =                 NULL,
695         .init_netns =           __ip_vs_tcp_init,
696         .exit_netns =           __ip_vs_tcp_exit,
697         .register_app =         tcp_register_app,
698         .unregister_app =       tcp_unregister_app,
699         .conn_schedule =        tcp_conn_schedule,
700         .conn_in_get =          ip_vs_conn_in_get_proto,
701         .conn_out_get =         ip_vs_conn_out_get_proto,
702         .snat_handler =         tcp_snat_handler,
703         .dnat_handler =         tcp_dnat_handler,
704         .csum_check =           tcp_csum_check,
705         .state_name =           tcp_state_name,
706         .state_transition =     tcp_state_transition,
707         .app_conn_bind =        tcp_app_conn_bind,
708         .debug_packet =         ip_vs_tcpudp_debug_packet,
709         .timeout_change =       tcp_timeout_change,
710 };