7788dd14349942f36464b95dc2209522a9a4ebff
[librecmc/librecmc.git] /
1 From: Pablo Neira Ayuso <pablo@netfilter.org>
2 Date: Sun, 7 Jan 2018 01:03:56 +0100
3 Subject: [PATCH] netfilter: nf_conntrack: add IPS_OFFLOAD status bit
4
5 This new bit tells us that the conntrack entry is owned by the flow
6 table offload infrastructure.
7
8  # cat /proc/net/nf_conntrack
9  ipv4     2 tcp      6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2
10
11 Note the [OFFLOAD] tag in the listing.
12
13 The timer of such conntrack entries appears stopped from userspace.
14 In practice, to make sure the conntrack entry does not go away, the
15 conntrack timer is periodically set to an arbitrarily large value that
16 gets refreshed on every iteration of the garbage collector, so it
17 never expires, and such entries display no internal state in the case of
18 TCP flows. This allows us to save a bit check in the packet path via
19 nf_ct_is_expired().
20
21 Conntrack entries that have been offloaded to the flow table
22 infrastructure cannot be deleted/flushed via ctnetlink. The flow table
23 infrastructure is also responsible for releasing this conntrack entry.
24
25 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
26 ---
27
28 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
29 +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
30 @@ -101,12 +101,16 @@ enum ip_conntrack_status {
31         IPS_HELPER_BIT = 13,
32         IPS_HELPER = (1 << IPS_HELPER_BIT),
33  
34 +       /* Conntrack has been offloaded to flow table. */
35 +       IPS_OFFLOAD_BIT = 14,
36 +       IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
37 +
38         /* Be careful here, modifying these bits can make things messy,
39          * so don't let users modify them directly.
40          */
41         IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
42                                  IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
43 -                                IPS_SEQ_ADJUST | IPS_TEMPLATE),
44 +                                IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
45  
46         __IPS_MAX_BIT = 14,
47  };
48 --- a/net/netfilter/nf_conntrack_core.c
49 +++ b/net/netfilter/nf_conntrack_core.c
50 @@ -901,6 +901,9 @@ static unsigned int early_drop_list(stru
51         hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
52                 tmp = nf_ct_tuplehash_to_ctrack(h);
53  
54 +               if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
55 +                       continue;
56 +
57                 if (nf_ct_is_expired(tmp)) {
58                         nf_ct_gc_expired(tmp);
59                         continue;
60 @@ -978,6 +981,18 @@ static bool gc_worker_can_early_drop(con
61         return false;
62  }
63  
64 +#define        DAY     (86400 * HZ)
65 +
66 +/* Set an arbitrary timeout large enough not to ever expire; this saves
67 + * us a check for the IPS_OFFLOAD_BIT from the packet path via
68 + * nf_ct_is_expired().
69 + */
70 +static void nf_ct_offload_timeout(struct nf_conn *ct)
71 +{
72 +       if (nf_ct_expires(ct) < DAY / 2)
73 +               ct->timeout = nfct_time_stamp + DAY;
74 +}
75 +
76  static void gc_worker(struct work_struct *work)
77  {
78         unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
79 @@ -1014,6 +1029,11 @@ static void gc_worker(struct work_struct
80                         tmp = nf_ct_tuplehash_to_ctrack(h);
81  
82                         scanned++;
83 +                       if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
84 +                               nf_ct_offload_timeout(tmp);
85 +                               continue;
86 +                       }
87 +
88                         if (nf_ct_is_expired(tmp)) {
89                                 nf_ct_gc_expired(tmp);
90                                 expired_count++;
91 --- a/net/netfilter/nf_conntrack_netlink.c
92 +++ b/net/netfilter/nf_conntrack_netlink.c
93 @@ -1120,6 +1120,14 @@ static const struct nla_policy ct_nla_po
94                                     .len = NF_CT_LABELS_MAX_SIZE },
95  };
96  
97 +static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
98 +{
99 +       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
100 +               return 0;
101 +
102 +       return ctnetlink_filter_match(ct, data);
103 +}
104 +
105  static int ctnetlink_flush_conntrack(struct net *net,
106                                      const struct nlattr * const cda[],
107                                      u32 portid, int report)
108 @@ -1132,7 +1140,7 @@ static int ctnetlink_flush_conntrack(str
109                         return PTR_ERR(filter);
110         }
111  
112 -       nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
113 +       nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
114                                   portid, report);
115         kfree(filter);
116  
117 @@ -1178,6 +1186,11 @@ static int ctnetlink_del_conntrack(struc
118  
119         ct = nf_ct_tuplehash_to_ctrack(h);
120  
121 +       if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
122 +               nf_ct_put(ct);
123 +               return -EBUSY;
124 +       }
125 +
126         if (cda[CTA_ID]) {
127                 u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
128                 if (id != (u32)(unsigned long)ct) {
129 --- a/net/netfilter/nf_conntrack_proto_tcp.c
130 +++ b/net/netfilter/nf_conntrack_proto_tcp.c
131 @@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_c
132  /* Print out the private part of the conntrack. */
133  static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
134  {
135 +       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
136 +               return;
137 +
138         seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
139  }
140  #endif
141 --- a/net/netfilter/nf_conntrack_standalone.c
142 +++ b/net/netfilter/nf_conntrack_standalone.c
143 @@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *
144         WARN_ON(!l4proto);
145  
146         ret = -ENOSPC;
147 -       seq_printf(s, "%-8s %u %-8s %u %ld ",
148 +       seq_printf(s, "%-8s %u %-8s %u ",
149                    l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
150 -                  l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
151 -                  nf_ct_expires(ct)  / HZ);
152 +                  l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
153 +
154 +       if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
155 +               seq_printf(s, "%ld ", nf_ct_expires(ct)  / HZ);
156  
157         if (l4proto->print_conntrack)
158                 l4proto->print_conntrack(s, ct);
159 @@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *
160         if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
161                 goto release;
162  
163 -       if (test_bit(IPS_ASSURED_BIT, &ct->status))
164 +       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
165 +               seq_puts(s, "[OFFLOAD] ");
166 +       else if (test_bit(IPS_ASSURED_BIT, &ct->status))
167                 seq_puts(s, "[ASSURED] ");
168  
169         if (seq_has_overflowed(s))