v1.5 branch refresh based upon upstream master @ c8677ca89e53e3be7988d54280fce166cc894a7e
[librecmc/librecmc.git] / target / linux / generic / backport-4.14 / 323-v4.16-netfilter-flow-table-support-for-IPv4.patch
1 From: Pablo Neira Ayuso <pablo@netfilter.org>
2 Date: Sun, 7 Jan 2018 01:04:15 +0100
3 Subject: [PATCH] netfilter: flow table support for IPv4
4
5 This patch adds the IPv4 flow table type, which implements the datapath
6 flow table to forward IPv4 traffic. The rationale is:
7
8 1) Look up the packet in the flow table from the ingress hook.
9 2) If there's a hit, decrement the TTL and pass it on to the neighbour
10    layer for transmission.
11 3) If there's a miss, the packet is passed up to the classic forwarding
12    path.
13
14 This patch also supports layer 3 source and destination NAT.
15
16 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
17 ---
18  create mode 100644 net/ipv4/netfilter/nf_flow_table_ipv4.c
19
20 --- a/net/ipv4/netfilter/Kconfig
21 +++ b/net/ipv4/netfilter/Kconfig
22 @@ -77,6 +77,14 @@ config NF_TABLES_ARP
23  
24  endif # NF_TABLES
25  
26 +config NF_FLOW_TABLE_IPV4
27 +       select NF_FLOW_TABLE
28 +       tristate "Netfilter flow table IPv4 module"
29 +       help
30 +         This option adds the flow table IPv4 support.
31 +
32 +         To compile it as a module, choose M here.
33 +
34  config NF_DUP_IPV4
35         tristate "Netfilter IPv4 packet duplication to alternate destination"
36         depends on !NF_CONNTRACK || NF_CONNTRACK
37 --- a/net/ipv4/netfilter/Makefile
38 +++ b/net/ipv4/netfilter/Makefile
39 @@ -43,6 +43,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redi
40  obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
41  obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
42  
43 +# flow table support
44 +obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
45 +
46  # generic IP tables 
47  obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
48  
49 --- /dev/null
50 +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
51 @@ -0,0 +1,283 @@
52 +#include <linux/kernel.h>
53 +#include <linux/init.h>
54 +#include <linux/module.h>
55 +#include <linux/netfilter.h>
56 +#include <linux/rhashtable.h>
57 +#include <linux/ip.h>
58 +#include <linux/netdevice.h>
59 +#include <net/ip.h>
60 +#include <net/neighbour.h>
61 +#include <net/netfilter/nf_flow_table.h>
62 +#include <net/netfilter/nf_tables.h>
63 +/* For layer 4 checksum field offset. */
64 +#include <linux/tcp.h>
65 +#include <linux/udp.h>
66 +/* Fix up the TCP checksum after an IPv4 address change; -1 on failure. */
67 +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
68 +                             __be32 addr, __be32 new_addr)
69 +{
70 +       struct tcphdr *tcph;
71 +
72 +       if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
73 +           skb_try_make_writable(skb, thoff + sizeof(*tcph)))
74 +               return -1;      /* TCP header not available/writable */
75 +
76 +       tcph = (void *)(skb_network_header(skb) + thoff);
77 +       inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
78 +
79 +       return 0;
80 +}
81 +/* Fix up the UDP checksum after an IPv4 address change; -1 on failure. */
82 +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
83 +                             __be32 addr, __be32 new_addr)
84 +{
85 +       struct udphdr *udph;
86 +
87 +       if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
88 +           skb_try_make_writable(skb, thoff + sizeof(*udph)))
89 +               return -1;      /* UDP header not available/writable */
90 +
91 +       udph = (void *)(skb_network_header(skb) + thoff);
92 +       if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
93 +               inet_proto_csum_replace4(&udph->check, skb, addr,
94 +                                        new_addr, true);
95 +               if (!udph->check)
96 +                       udph->check = CSUM_MANGLED_0; /* never store 0 */
97 +       }
98 +
99 +       return 0;
100 +}
101 +/* Dispatch the L4 checksum fixup by protocol; 0 on success, -1 on failure. */
102 +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
103 +                                 unsigned int thoff, __be32 addr,
104 +                                 __be32 new_addr)
105 +{
106 +       switch (iph->protocol) {
107 +       case IPPROTO_TCP:
108 +               if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
109 +                       return -1;      /* not NF_DROP: that is 0 */
110 +               break;
111 +       case IPPROTO_UDP:
112 +               if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
113 +                       return -1;      /* callers test for < 0 */
114 +               break;
115 +       }
116 +
117 +       return 0;
118 +}
119 +/* Source NAT: rewrite the address for @dir, then fix IPv4 and L4 checksums. */
120 +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
121 +                          struct iphdr *iph, unsigned int thoff,
122 +                          enum flow_offload_tuple_dir dir)
123 +{
124 +       __be32 addr, new_addr;
125 +
126 +       switch (dir) {
127 +       case FLOW_OFFLOAD_DIR_ORIGINAL:
128 +               addr = iph->saddr;
129 +               new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
130 +               iph->saddr = new_addr;
131 +               break;
132 +       case FLOW_OFFLOAD_DIR_REPLY:
133 +               addr = iph->daddr;
134 +               new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
135 +               iph->daddr = new_addr;
136 +               break;
137 +       default:
138 +               return -1;
139 +       }
140 +       csum_replace4(&iph->check, addr, new_addr);     /* IPv4 header checksum */
141 +
142 +       return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
143 +}
144 +/* Destination NAT: rewrite the address for @dir, fix IPv4 and L4 checksums. */
145 +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
146 +                          struct iphdr *iph, unsigned int thoff,
147 +                          enum flow_offload_tuple_dir dir)
148 +{
149 +       __be32 addr, new_addr;
150 +
151 +       switch (dir) {
152 +       case FLOW_OFFLOAD_DIR_ORIGINAL:
153 +               addr = iph->daddr;
154 +               new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
155 +               iph->daddr = new_addr;
156 +               break;
157 +       case FLOW_OFFLOAD_DIR_REPLY:
158 +               addr = iph->saddr;
159 +               new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
160 +               iph->saddr = new_addr;
161 +               break;
162 +       default:
163 +               return -1;
164 +       }
165 +       csum_replace4(&iph->check, addr, new_addr);     /* IPv4 header checksum */
166 +       return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
167 +}
168 +/* Apply SNAT/DNAT per flow flags; re-read ip_hdr() as helpers may move it. */
169 +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
170 +                         enum flow_offload_tuple_dir dir)
171 +{
172 +       unsigned int thoff = ip_hdr(skb)->ihl * 4;
173 +       u8 proto = ip_hdr(skb)->protocol;
174 +
175 +       if (flow->flags & FLOW_OFFLOAD_SNAT &&
176 +           (nf_flow_snat_port(flow, skb, thoff, proto, dir) < 0 ||
177 +            nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
178 +               return -1;
179 +       if (flow->flags & FLOW_OFFLOAD_DNAT &&
180 +           (nf_flow_dnat_port(flow, skb, thoff, proto, dir) < 0 ||
181 +            nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
182 +               return -1;
183 +
184 +       return 0;
185 +}
186 +/* True when the header length differs from a bare struct iphdr (options). */
187 +static bool ip_has_options(unsigned int thoff)
188 +{
189 +       return thoff != sizeof(struct iphdr);
190 +}
191 +/* Fill @tuple from an IPv4 TCP/UDP packet; -1 if it cannot be offloaded. */
192 +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
193 +                           struct flow_offload_tuple *tuple)
194 +{
195 +       struct flow_ports *ports;
196 +       unsigned int thoff;
197 +       struct iphdr *iph;
198 +
199 +       if (!pskb_may_pull(skb, sizeof(*iph)))
200 +               return -1;
201 +
202 +       iph = ip_hdr(skb);
203 +       thoff = iph->ihl * 4;
204 +
205 +       if (ip_is_fragment(iph) ||
206 +           unlikely(ip_has_options(thoff)))
207 +               return -1;
208 +
209 +       if (iph->protocol != IPPROTO_TCP &&
210 +           iph->protocol != IPPROTO_UDP)
211 +               return -1;
212 +
213 +       if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
214 +               return -1;
215 +
216 +       iph = ip_hdr(skb);      /* the pull above may have moved the header */
217 +       ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
218 +
219 +       tuple->src_v4.s_addr    = iph->saddr;
220 +       tuple->dst_v4.s_addr    = iph->daddr;
221 +       tuple->src_port         = ports->source;
222 +       tuple->dst_port         = ports->dest;
223 +       tuple->l3proto          = AF_INET;
224 +       tuple->l4proto          = iph->protocol;
225 +       tuple->iifidx           = dev->ifindex;
226 +
227 +       return 0;
228 +}
229 +
230 +/* Based on ip_exceeds_mtu(): true if oversized, DF set and not valid GSO. */
231 +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
232 +{
233 +       if (skb->len <= mtu)
234 +               return false;
235 +
236 +       if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
237 +               return false;
238 +
239 +       if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
240 +               return false;
241 +
242 +       return true;
243 +}
244 +/* Check @skb against the MTU that ip_dst_mtu_maybe_forward() gives for @rt. */
245 +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
246 +{
247 +       u32 mtu;
248 +
249 +       mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
250 +       if (__nf_flow_exceeds_mtu(skb, mtu))
251 +               return true;
252 +
253 +       return false;
254 +}
255 +/* Hook: transmit packets of known flows directly, otherwise NF_ACCEPT them. */
256 +static unsigned int
257 +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
258 +                       const struct nf_hook_state *state)
259 +{
260 +       struct flow_offload_tuple_rhash *tuplehash;
261 +       struct nf_flowtable *flow_table = priv;
262 +       struct flow_offload_tuple tuple = {};
263 +       enum flow_offload_tuple_dir dir;
264 +       struct flow_offload *flow;
265 +       struct net_device *outdev;
266 +       const struct rtable *rt;
267 +       struct iphdr *iph;
268 +       __be32 nexthop;
269 +
270 +       if (skb->protocol != htons(ETH_P_IP))
271 +               return NF_ACCEPT;
272 +
273 +       if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
274 +               return NF_ACCEPT;
275 +
276 +       tuplehash = flow_offload_lookup(flow_table, &tuple);
277 +       if (tuplehash == NULL)
278 +               return NF_ACCEPT;       /* no flow entry for this tuple */
279 +
280 +       outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
281 +       if (!outdev)
282 +               return NF_ACCEPT;
283 +
284 +       dir = tuplehash->tuple.dir;
285 +       flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
286 +
287 +       rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
288 +       if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
289 +               return NF_ACCEPT;
290 +
291 +       if (skb_try_make_writable(skb, sizeof(*iph)))
292 +               return NF_DROP;
293 +
294 +       if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
295 +           nf_flow_nat_ip(flow, skb, dir) < 0)
296 +               return NF_DROP;
297 +
298 +       flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
299 +       iph = ip_hdr(skb);      /* read only after the calls above */
300 +       ip_decrease_ttl(iph);
301 +
302 +       skb->dev = outdev;
303 +       nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
304 +       neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
305 +
306 +       return NF_STOLEN;       /* skb was passed to neigh_xmit() */
307 +}
308 +/* IPv4 flow table type; registered in init and unregistered in exit below. */
309 +static struct nf_flowtable_type flowtable_ipv4 = {
310 +       .family         = NFPROTO_IPV4,
311 +       .params         = &nf_flow_offload_rhash_params,
312 +       .gc             = nf_flow_offload_work_gc,
313 +       .hook           = nf_flow_offload_ip_hook,
314 +       .owner          = THIS_MODULE,
315 +};
316 +/* Module init: register the IPv4 flow table type; always returns 0. */
317 +static int __init nf_flow_ipv4_module_init(void)
318 +{
319 +       nft_register_flowtable_type(&flowtable_ipv4);
320 +
321 +       return 0;
322 +}
323 +/* Module exit: unregister the IPv4 flow table type. */
324 +static void __exit nf_flow_ipv4_module_exit(void)
325 +{
326 +       nft_unregister_flowtable_type(&flowtable_ipv4);
327 +}
328 +
329 +module_init(nf_flow_ipv4_module_init);
330 +module_exit(nf_flow_ipv4_module_exit);
331 +
332 +MODULE_LICENSE("GPL");
333 +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
334 +MODULE_ALIAS_NF_FLOWTABLE(AF_INET);