+++ /dev/null
-From: Felix Fietkau <nbd@nbd.name>
-Date: Thu, 15 Mar 2018 18:21:43 +0100
-Subject: [PATCH] netfilter: nf_flow_table: clean up and fix dst handling
-
-dst handling in the code is inconsistent and possibly wrong. In my test,
-skb_dst(skb) holds the dst entry after routing but before NAT, so the
-code could possibly return the same dst entry for both directions of a
-connection.
-Additionally, there was some confusion over the dst entry vs the address
-passed as parameter to rt_nexthop/rt6_nexthop.
-
-Do an explicit dst lookup for both ends of the connection and always use
-the source address for it. When running the IP hook, use the dst entry
-for the opposite direction for determining the route.
-
-Signed-off-by: Felix Fietkau <nbd@nbd.name>
----
-
---- a/net/netfilter/nf_flow_table_ip.c
-+++ b/net/netfilter/nf_flow_table_ip.c
-@@ -238,7 +238,7 @@ nf_flow_offload_ip_hook(void *priv, stru
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-- rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-+ rt = (const struct rtable *)flow->tuplehash[!dir].tuple.dst_cache;
-
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
- (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
-@@ -455,7 +455,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-+ rt = (struct rt6_info *)flow->tuplehash[!dir].tuple.dst_cache;
-
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
---- a/net/netfilter/nft_flow_offload.c
-+++ b/net/netfilter/nft_flow_offload.c
-@@ -17,27 +17,38 @@ struct nft_flow_offload {
- struct nft_flowtable *flowtable;
- };
-
--static int nft_flow_route(const struct nft_pktinfo *pkt,
-- const struct nf_conn *ct,
-- struct nf_flow_route *route,
-- enum ip_conntrack_dir dir)
-+static struct dst_entry *
-+nft_flow_dst(const struct nf_conn *ct, enum ip_conntrack_dir dir,
-+ const struct nft_pktinfo *pkt)
- {
-- struct dst_entry *this_dst = skb_dst(pkt->skb);
-- struct dst_entry *other_dst = NULL;
-+ struct dst_entry *dst;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
-- fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
-+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
- break;
- case NFPROTO_IPV6:
-- fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
-+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
- break;
- }
-
-- nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
-- if (!other_dst)
-+ nf_route(nft_net(pkt), &dst, &fl, false, nft_pf(pkt));
-+
-+ return dst;
-+}
-+
-+static int nft_flow_route(const struct nft_pktinfo *pkt,
-+ const struct nf_conn *ct,
-+ struct nf_flow_route *route,
-+ enum ip_conntrack_dir dir)
-+{
-+ struct dst_entry *this_dst, *other_dst;
-+
-+ this_dst = nft_flow_dst(ct, dir, pkt);
-+ other_dst = nft_flow_dst(ct, !dir, pkt);
-+ if (!this_dst || !other_dst)
- return -ENOENT;
-
- route->tuple[dir].dst = this_dst;
--- /dev/null
+From: wenxu <wenxu@ucloud.cn>
+Date: Wed, 9 Jan 2019 10:40:11 +0800
+Subject: [PATCH] netfilter: nft_flow_offload: Fix reverse route lookup
+
+Using the following example:
+
+ client 1.1.1.7 ---> 2.2.2.7 which dnat to 10.0.0.7 server
+
+The first reply packet (i.e. syn+ack) uses an incorrect destination
+address for the reverse route lookup since it uses:
+
+ daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
+
+which is 2.2.2.7 in the scenario that is described above, while this
+should be:
+
+ daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+
+that is 10.0.0.7.
+
+Signed-off-by: wenxu <wenxu@ucloud.cn>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+---
+
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -29,10 +29,10 @@ static int nft_flow_route(const struct n
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+- fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
++ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ break;
+ case NFPROTO_IPV6:
+- fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
++ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ break;
+ }
+
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-- rt = (const struct rtable *)flow->tuplehash[!dir].tuple.dst_cache;
-+ rt = (struct rtable *)flow->tuplehash[!dir].tuple.dst_cache;
+- rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
++ rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
(ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
--- /dev/null
+From: wenxu <wenxu@ucloud.cn>
+Date: Thu, 10 Jan 2019 14:51:35 +0800
+Subject: [PATCH] netfilter: nft_flow_offload: fix interaction with vrf slave
+ device
+
+In the forward chain, the iif is changed from the slave device to the
+master vrf device. Thus, flow offload does not find a match on the lower
+slave device.
+
+This patch uses the cached route, i.e. dst->dev, to update the iif and
+oif fields in the flow entry.
+
+After this patch, the following example works fine:
+
+ # ip addr add dev eth0 1.1.1.1/24
+ # ip addr add dev eth1 10.0.0.1/24
+ # ip link add user1 type vrf table 1
+ # ip l set user1 up
+ # ip l set dev eth0 master user1
+ # ip l set dev eth1 master user1
+
+ # nft add table firewall
+ # nft add flowtable f fb1 { hook ingress priority 0 \; devices = { eth0, eth1 } \; }
+ # nft add chain f ftb-all {type filter hook forward priority 0 \; policy accept \; }
+ # nft add rule f ftb-all ct zone 1 ip protocol tcp flow offload @fb1
+ # nft add rule f ftb-all ct zone 1 ip protocol udp flow offload @fb1
+
+Signed-off-by: wenxu <wenxu@ucloud.cn>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+---
+
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -84,7 +84,6 @@ struct flow_offload {
+ struct nf_flow_route {
+ struct {
+ struct dst_entry *dst;
+- int ifindex;
+ } tuple[FLOW_OFFLOAD_DIR_MAX];
+ };
+
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -28,6 +28,7 @@ flow_offload_fill_dir(struct flow_offloa
+ {
+ struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
++ struct dst_entry *other_dst = route->tuple[!dir].dst;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ ft->dir = dir;
+@@ -50,8 +51,8 @@ flow_offload_fill_dir(struct flow_offloa
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+
+- ft->iifidx = route->tuple[dir].ifindex;
+- ft->oifidx = route->tuple[!dir].ifindex;
++ ft->iifidx = other_dst->dev->ifindex;
++ ft->oifidx = dst->dev->ifindex;
+ ft->dst_cache = dst;
+ }
+
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -30,9 +30,11 @@ static int nft_flow_route(const struct n
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
++ fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
++ fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ break;
+ }
+
+@@ -41,9 +43,7 @@ static int nft_flow_route(const struct n
+ return -ENOENT;
+
+ route->tuple[dir].dst = this_dst;
+- route->tuple[dir].ifindex = nft_in(pkt)->ifindex;
+ route->tuple[!dir].dst = other_dst;
+- route->tuple[!dir].ifindex = nft_out(pkt)->ifindex;
+
+ return 0;
+ }
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
-@@ -164,6 +164,8 @@ struct nf_flow_table_hw {
+@@ -163,6 +163,8 @@ struct nf_flow_table_hw {
int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload);
void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload);
struct flow_offload_entry {
struct flow_offload flow;
-@@ -151,6 +152,22 @@ void flow_offload_free(struct flow_offlo
+@@ -152,6 +153,22 @@ void flow_offload_free(struct flow_offlo
}
EXPORT_SYMBOL_GPL(flow_offload_free);
obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
--- /dev/null
+++ b/net/netfilter/xt_FLOWOFFLOAD.c
-@@ -0,0 +1,408 @@
+@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2018 Felix Fietkau <nbd@nbd.name>
+ *
+#include <linux/netfilter/xt_FLOWOFFLOAD.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_conntrack.h>
-+#include <net/netfilter/nf_flow_table.h>
++#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_helper.h>
++#include <net/netfilter/nf_flow_table.h>
+
+static struct nf_flowtable nf_flowtable;
+static HLIST_HEAD(hooks);
+}
+
+static bool
-+xt_flowoffload_skip(struct sk_buff *skb)
++xt_flowoffload_skip(struct sk_buff *skb, int family)
+{
-+ struct ip_options *opt = &(IPCB(skb)->opt);
-+
-+ if (unlikely(opt->optlen))
-+ return true;
+ if (skb_sec_path(skb))
+ return true;
+
++ if (family == NFPROTO_IPV4) {
++ const struct ip_options *opt = &(IPCB(skb)->opt);
++
++ if (unlikely(opt->optlen))
++ return true;
++ }
++
+ return false;
+}
+
+static struct dst_entry *
+xt_flowoffload_dst(const struct nf_conn *ct, enum ip_conntrack_dir dir,
-+ const struct xt_action_param *par)
++ const struct xt_action_param *par, int ifindex)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+ switch (xt_family(par)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
++ fl.u.ip4.flowi4_oif = ifindex;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.saddr = ct->tuplehash[dir].tuple.dst.u3.in6;
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
++ fl.u.ip6.flowi6_oif = ifindex;
+ break;
+ }
+
+{
+ struct dst_entry *this_dst, *other_dst;
+
-+ this_dst = xt_flowoffload_dst(ct, dir, par);
-+ other_dst = xt_flowoffload_dst(ct, !dir, par);
++ this_dst = xt_flowoffload_dst(ct, !dir, par, xt_out(par)->ifindex);
++ other_dst = xt_flowoffload_dst(ct, dir, par, xt_in(par)->ifindex);
+ if (!this_dst || !other_dst)
+ return -ENOENT;
+
+ return -EINVAL;
+
+ route->tuple[dir].dst = this_dst;
-+ route->tuple[dir].ifindex = xt_in(par)->ifindex;
+ route->tuple[!dir].dst = other_dst;
-+ route->tuple[!dir].ifindex = xt_out(par)->ifindex;
+
+ return 0;
+}
+flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_flowoffload_target_info *info = par->targinfo;
-+ const struct nf_conn_help *help;
++ struct tcphdr _tcph, *tcph = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct nf_flow_route route;
+ struct nf_conn *ct;
+ struct net *net;
+
-+ if (xt_flowoffload_skip(skb))
++ if (xt_flowoffload_skip(skb, xt_family(par)))
+ return XT_CONTINUE;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ case IPPROTO_TCP:
+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return XT_CONTINUE;
++
++ tcph = skb_header_pointer(skb, par->thoff,
++ sizeof(_tcph), &_tcph);
++ if (unlikely(!tcph || tcph->fin || tcph->rst))
++ return XT_CONTINUE;
+ break;
+ case IPPROTO_UDP:
+ break;
+ return XT_CONTINUE;
+ }
+
-+ help = nfct_help(ct);
-+ if (help)
++ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
++ ct->status & IPS_SEQ_ADJUST)
+ return XT_CONTINUE;
+
-+ if (ctinfo == IP_CT_NEW ||
-+ ctinfo == IP_CT_RELATED)
++ if (!nf_ct_is_confirmed(ct))
+ return XT_CONTINUE;
+
+ if (!xt_in(par) || !xt_out(par))
+ if (!flow)
+ goto err_flow_alloc;
+
++ if (tcph) {
++ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
++ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
++ }
++
+ if (flow_offload_add(&nf_flowtable, flow) < 0)
+ goto err_flow_add;
+
struct flow_offload {
struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
-@@ -126,6 +133,22 @@ unsigned int nf_flow_offload_ip_hook(voi
+@@ -125,6 +132,22 @@ unsigned int nf_flow_offload_ip_hook(voi
unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state);
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
-@@ -218,10 +218,16 @@ int flow_offload_add(struct nf_flowtable
+@@ -219,10 +219,16 @@ int flow_offload_add(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_add);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-@@ -236,6 +242,9 @@ static void flow_offload_del(struct nf_f
+@@ -237,6 +243,9 @@ static void flow_offload_del(struct nf_f
if (!(flow->flags & FLOW_OFFLOAD_TEARDOWN))
flow_offload_fixup_ct_state(e->ct);
flow_offload_free(flow);
}
-@@ -349,6 +358,9 @@ static int nf_flow_offload_gc_step(struc
+@@ -350,6 +359,9 @@ static int nf_flow_offload_gc_step(struc
if (!teardown)
nf_ct_offload_timeout(flow);
if (nf_flow_has_expired(flow) || teardown)
flow_offload_del(flow_table, flow);
}
-@@ -484,10 +496,43 @@ int nf_flow_dnat_port(const struct flow_
+@@ -485,10 +497,43 @@ int nf_flow_dnat_port(const struct flow_
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
err = rhashtable_init(&flowtable->rhashtable,
-@@ -525,6 +570,8 @@ static void nf_flow_table_iterate_cleanu
+@@ -526,6 +571,8 @@ static void nf_flow_table_iterate_cleanu
{
nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
}
void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
-@@ -538,6 +585,26 @@ void nf_flow_table_cleanup(struct net *n
+@@ -539,6 +586,26 @@ void nf_flow_table_cleanup(struct net *n
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
mutex_lock(&flowtable_lock);
-@@ -547,9 +614,58 @@ void nf_flow_table_free(struct nf_flowta
+@@ -548,9 +615,58 @@ void nf_flow_table_free(struct nf_flowta
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
rhashtable_destroy(&flow_table->rhashtable);
nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
-@@ -121,6 +121,9 @@ static void nft_flow_offload_eval(const
+@@ -110,6 +110,9 @@ static void nft_flow_offload_eval(const
if (ret < 0)
goto err_flow_add;
struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
-@@ -358,7 +358,7 @@ static int nf_flow_offload_gc_step(struc
+@@ -359,7 +359,7 @@ static int nf_flow_offload_gc_step(struc
if (!teardown)
nf_ct_offload_timeout(flow);