net/sched/sch_tbf.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * net/sched/sch_tbf.c  Token Bucket Filter queue.
   4  *
   5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6  *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
   7  *                                               original idea by Martin Devera
   8  */
   9
  10 #include <linux/module.h>
  11 #include <linux/types.h>
  12 #include <linux/kernel.h>
  13 #include <linux/string.h>
  14 #include <linux/errno.h>
  15 #include <linux/skbuff.h>
  16 #include <net/netlink.h>
  17 #include <net/sch_generic.h>
  18 #include <net/pkt_sched.h>
  19
  20
  21 /*      Simple Token Bucket Filter.
  22         =======================================
  23
  24         SOURCE.
  25         -------
  26
  27         None.
  28
  29         Description.
  30         ------------
  31
  32         A data flow obeys TBF with rate R and depth B, if for any
  33         time interval t_i...t_f the number of transmitted bits
  34         does not exceed B + R*(t_f-t_i).
  35
  36         Packetized version of this definition:
  37         The sequence of packets of sizes s_i served at moments t_i
  38         obeys TBF, if for any i<=k:
  39
  40         s_i+....+s_k <= B + R*(t_k - t_i)
  41
  42         Algorithm.
  43         ----------
  44
  45         Let N(t_i) be B/R initially and N(t) grow continuously with time as:
  46
  47         N(t+delta) = min{B/R, N(t) + delta}
  48
  49         If the first packet in queue has length S, it may be
  50         transmitted only at the time t_* when S/R <= N(t_*),
  51         and in this case N(t) jumps:
  52
  53         N(t_* + 0) = N(t_* - 0) - S/R.
  54
  55
  56
  57         Actually, QoS requires two TBF to be applied to a data stream.
  58         One of them controls steady state burst size, another
  59         one with rate P (peak rate) and depth M (equal to link MTU)
  60         limits bursts at a smaller time scale.
  61
  62         It is easy to see that P>R, and B>M. If P is infinity, this double
  63         TBF is equivalent to a single one.
  64
  65         When TBF works in reshaping mode, latency is estimated as:
  66
  67         lat = max ((L-B)/R, (L-M)/P)
  68
  69
  70         NOTES.
  71         ------
  72
  73         If TBF throttles, it starts a watchdog timer, which will wake it up
  74         when it is ready to transmit.
  75         Note that the minimal timer resolution is 1/HZ.
  76         If no new packets arrive during this period,
  77         or if the device is not awaken by EOI for some previous packet,
  78         TBF can stop its activity for 1/HZ.
  79
  80
  81         This means, that with depth B, the maximal rate is
  82
  83         R_crit = B*HZ
  84
  85         F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
  86
  87         Note that the peak rate TBF is much more tough: with MTU 1500
  88         P_crit = 150Kbytes/sec. So, if you need greater peak
  89         rates, use alpha with HZ=1000 :-)
  90
  91         With classful TBF, limit is just kept for backwards compatibility.
  92         It is passed to the default bfifo qdisc - if the inner qdisc is
  93         changed the limit is not effective anymore.
  94 */
  95
  96 struct tbf_sched_data {
  97 /* Parameters */
  98         u32             limit;          /* Maximal length of backlog: bytes */
  99         u32             max_size;
 100         s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
 101         s64             mtu;
 102         struct psched_ratecfg rate;
 103         struct psched_ratecfg peak;
 104
 105 /* Variables */
 106         s64     tokens;                 /* Current number of B tokens */
 107         s64     ptokens;                /* Current number of P tokens */
 108         s64     t_c;                    /* Time check-point */
 109         struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
 110         struct qdisc_watchdog watchdog; /* Watchdog timer */
 111 };
 112
 113
 114 /* Time to Length, convert time in ns to length in bytes
 115  * to determinate how many bytes can be sent in given time.
 116  */
 117 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
 118                          u64 time_in_ns)
 119 {
 120         /* The formula is :
 121          * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
 122          */
 123         u64 len = time_in_ns * r->rate_bytes_ps;
 124
 125         do_div(len, NSEC_PER_SEC);
 126
 127         if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
 128                 do_div(len, 53);
 129                 len = len * 48;
 130         }
 131
 132         if (len > r->overhead)
 133                 len -= r->overhead;
 134         else
 135                 len = 0;
 136
 137         return len;
 138 }
 139
 140 /* GSO packet is too big, segment it so that tbf can transmit
 141  * each segment in time
 142  */
 143 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 144                        struct sk_buff **to_free)
 145 {
 146         struct tbf_sched_data *q = qdisc_priv(sch);
 147         struct sk_buff *segs, *nskb;
 148         netdev_features_t features = netif_skb_features(skb);
 149         unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 150         int ret, nb;
 151
 152         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 153
 154         if (IS_ERR_OR_NULL(segs))
 155                 return qdisc_drop(skb, sch, to_free);
 156
 157         nb = 0;
 158         while (segs) {
 159                 nskb = segs->next;
 160                 skb_mark_not_on_list(segs);
 161                 qdisc_skb_cb(segs)->pkt_len = segs->len;
 162                 len += segs->len;
 163                 ret = qdisc_enqueue(segs, q->qdisc, to_free);
 164                 if (ret != NET_XMIT_SUCCESS) {
 165                         if (net_xmit_drop_count(ret))
 166                                 qdisc_qstats_drop(sch);
 167                 } else {
 168                         nb++;
 169                 }
 170                 segs = nskb;
 171         }
 172         sch->q.qlen += nb;
 173         if (nb > 1)
 174                 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 175         consume_skb(skb);
 176         return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 177 }
 178
 179 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 180                        struct sk_buff **to_free)
 181 {
 182         struct tbf_sched_data *q = qdisc_priv(sch);
 183         unsigned int len = qdisc_pkt_len(skb);
 184         int ret;
 185
 186         if (qdisc_pkt_len(skb) > q->max_size) {
 187                 if (skb_is_gso(skb) &&
 188                     skb_gso_validate_mac_len(skb, q->max_size))
 189                         return tbf_segment(skb, sch, to_free);
 190                 return qdisc_drop(skb, sch, to_free);
 191         }
 192         ret = qdisc_enqueue(skb, q->qdisc, to_free);
 193         if (ret != NET_XMIT_SUCCESS) {
 194                 if (net_xmit_drop_count(ret))
 195                         qdisc_qstats_drop(sch);
 196                 return ret;
 197         }
 198
 199         sch->qstats.backlog += len;
 200         sch->q.qlen++;
 201         return NET_XMIT_SUCCESS;
 202 }
 203
 204 static bool tbf_peak_present(const struct tbf_sched_data *q)
 205 {
 206         return q->peak.rate_bytes_ps;
 207 }
 208
 209 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 210 {
 211         struct tbf_sched_data *q = qdisc_priv(sch);
 212         struct sk_buff *skb;
 213
 214         skb = q->qdisc->ops->peek(q->qdisc);
 215
 216         if (skb) {
 217                 s64 now;
 218                 s64 toks;
 219                 s64 ptoks = 0;
 220                 unsigned int len = qdisc_pkt_len(skb);
 221
 222                 now = ktime_get_ns();
 223                 toks = min_t(s64, now - q->t_c, q->buffer);
 224
 225                 if (tbf_peak_present(q)) {
 226                         ptoks = toks + q->ptokens;
 227                         if (ptoks > q->mtu)
 228                                 ptoks = q->mtu;
 229                         ptoks -= (s64) psched_l2t_ns(&q->peak, len);
 230                 }
 231                 toks += q->tokens;
 232                 if (toks > q->buffer)
 233                         toks = q->buffer;
 234                 toks -= (s64) psched_l2t_ns(&q->rate, len);
 235
 236                 if ((toks|ptoks) >= 0) {
 237                         skb = qdisc_dequeue_peeked(q->qdisc);
 238                         if (unlikely(!skb))
 239                                 return NULL;
 240
 241                         q->t_c = now;
 242                         q->tokens = toks;
 243                         q->ptokens = ptoks;
 244                         qdisc_qstats_backlog_dec(sch, skb);
 245                         sch->q.qlen--;
 246                         qdisc_bstats_update(sch, skb);
 247                         return skb;
 248                 }
 249
 250                 qdisc_watchdog_schedule_ns(&q->watchdog,
 251                                            now + max_t(long, -toks, -ptoks));
 252
 253                 /* Maybe we have a shorter packet in the queue,
 254                    which can be sent now. It sounds cool,
 255                    but, however, this is wrong in principle.
 256                    We MUST NOT reorder packets under these circumstances.
 257
 258                    Really, if we split the flow into independent
 259                    subflows, it would be a very good solution.
 260                    This is the main idea of all FQ algorithms
 261                    (cf. CSZ, HPFQ, HFSC)
 262                  */
 263
 264                 qdisc_qstats_overlimit(sch);
 265         }
 266         return NULL;
 267 }
 268
 269 static void tbf_reset(struct Qdisc *sch)
 270 {
 271         struct tbf_sched_data *q = qdisc_priv(sch);
 272
 273         qdisc_reset(q->qdisc);
 274         sch->qstats.backlog = 0;
 275         sch->q.qlen = 0;
 276         q->t_c = ktime_get_ns();
 277         q->tokens = q->buffer;
 278         q->ptokens = q->mtu;
 279         qdisc_watchdog_cancel(&q->watchdog);
 280 }
 281
 282 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 283         [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
 284         [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 285         [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 286         [TCA_TBF_RATE64]        = { .type = NLA_U64 },
 287         [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
 288         [TCA_TBF_BURST] = { .type = NLA_U32 },
 289         [TCA_TBF_PBURST] = { .type = NLA_U32 },
 290 };
 291
 292 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 293                       struct netlink_ext_ack *extack)
 294 {
 295         int err;
 296         struct tbf_sched_data *q = qdisc_priv(sch);
 297         struct nlattr *tb[TCA_TBF_MAX + 1];
 298         struct tc_tbf_qopt *qopt;
 299         struct Qdisc *child = NULL;
 300         struct psched_ratecfg rate;
 301         struct psched_ratecfg peak;
 302         u64 max_size;
 303         s64 buffer, mtu;
 304         u64 rate64 = 0, prate64 = 0;
 305
 306         err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
 307                                           NULL);
 308         if (err < 0)
 309                 return err;
 310
 311         err = -EINVAL;
 312         if (tb[TCA_TBF_PARMS] == NULL)
 313                 goto done;
 314
 315         qopt = nla_data(tb[TCA_TBF_PARMS]);
 316         if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
 317                 qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
 318                                               tb[TCA_TBF_RTAB],
 319                                               NULL));
 320
 321         if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
 322                         qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
 323                                                       tb[TCA_TBF_PTAB],
 324                                                       NULL));
 325
 326         buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
 327         mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
 328
 329         if (tb[TCA_TBF_RATE64])
 330                 rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
 331         psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
 332
 333         if (tb[TCA_TBF_BURST]) {
 334                 max_size = nla_get_u32(tb[TCA_TBF_BURST]);
 335                 buffer = psched_l2t_ns(&rate, max_size);
 336         } else {
 337                 max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
 338         }
 339
 340         if (qopt->peakrate.rate) {
 341                 if (tb[TCA_TBF_PRATE64])
 342                         prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
 343                 psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
 344                 if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
 345                         pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
 346                                         peak.rate_bytes_ps, rate.rate_bytes_ps);
 347                         err = -EINVAL;
 348                         goto done;
 349                 }
 350
 351                 if (tb[TCA_TBF_PBURST]) {
 352                         u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
 353                         max_size = min_t(u32, max_size, pburst);
 354                         mtu = psched_l2t_ns(&peak, pburst);
 355                 } else {
 356                         max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
 357                 }
 358         } else {
 359                 memset(&peak, 0, sizeof(peak));
 360         }
 361
 362         if (max_size < psched_mtu(qdisc_dev(sch)))
 363                 pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
 364                                     max_size, qdisc_dev(sch)->name,
 365                                     psched_mtu(qdisc_dev(sch)));
 366
 367         if (!max_size) {
 368                 err = -EINVAL;
 369                 goto done;
 370         }
 371
 372         if (q->qdisc != &noop_qdisc) {
 373                 err = fifo_set_limit(q->qdisc, qopt->limit);
 374                 if (err)
 375                         goto done;
 376         } else if (qopt->limit > 0) {
 377                 child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
 378                                          extack);
 379                 if (IS_ERR(child)) {
 380                         err = PTR_ERR(child);
 381                         goto done;
 382                 }
 383
 384                 /* child is fifo, no need to check for noop_qdisc */
 385                 qdisc_hash_add(child, true);
 386         }
 387
 388         sch_tree_lock(sch);
 389         if (child) {
 390                 qdisc_tree_flush_backlog(q->qdisc);
 391                 qdisc_put(q->qdisc);
 392                 q->qdisc = child;
 393         }
 394         q->limit = qopt->limit;
 395         if (tb[TCA_TBF_PBURST])
 396                 q->mtu = mtu;
 397         else
 398                 q->mtu = PSCHED_TICKS2NS(qopt->mtu);
 399         q->max_size = max_size;
 400         if (tb[TCA_TBF_BURST])
 401                 q->buffer = buffer;
 402         else
 403                 q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 404         q->tokens = q->buffer;
 405         q->ptokens = q->mtu;
 406
 407         memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
 408         memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 409
 410         sch_tree_unlock(sch);
 411         err = 0;
 412 done:
 413         return err;
 414 }
 415
 416 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
 417                     struct netlink_ext_ack *extack)
 418 {
 419         struct tbf_sched_data *q = qdisc_priv(sch);
 420
 421         qdisc_watchdog_init(&q->watchdog, sch);
 422         q->qdisc = &noop_qdisc;
 423
 424         if (!opt)
 425                 return -EINVAL;
 426
 427         q->t_c = ktime_get_ns();
 428
 429         return tbf_change(sch, opt, extack);
 430 }
 431
 432 static void tbf_destroy(struct Qdisc *sch)
 433 {
 434         struct tbf_sched_data *q = qdisc_priv(sch);
 435
 436         qdisc_watchdog_cancel(&q->watchdog);
 437         qdisc_put(q->qdisc);
 438 }
 439
 440 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 441 {
 442         struct tbf_sched_data *q = qdisc_priv(sch);
 443         struct nlattr *nest;
 444         struct tc_tbf_qopt opt;
 445
 446         sch->qstats.backlog = q->qdisc->qstats.backlog;
 447         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 448         if (nest == NULL)
 449                 goto nla_put_failure;
 450
 451         opt.limit = q->limit;
 452         psched_ratecfg_getrate(&opt.rate, &q->rate);
 453         if (tbf_peak_present(q))
 454                 psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 455         else
 456                 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 457         opt.mtu = PSCHED_NS2TICKS(q->mtu);
 458         opt.buffer = PSCHED_NS2TICKS(q->buffer);
 459         if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 460                 goto nla_put_failure;
 461         if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
 462             nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
 463                               TCA_TBF_PAD))
 464                 goto nla_put_failure;
 465         if (tbf_peak_present(q) &&
 466             q->peak.rate_bytes_ps >= (1ULL << 32) &&
 467             nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
 468                               TCA_TBF_PAD))
 469                 goto nla_put_failure;
 470
 471         return nla_nest_end(skb, nest);
 472
 473 nla_put_failure:
 474         nla_nest_cancel(skb, nest);
 475         return -1;
 476 }
 477
 478 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 479                           struct sk_buff *skb, struct tcmsg *tcm)
 480 {
 481         struct tbf_sched_data *q = qdisc_priv(sch);
 482
 483         tcm->tcm_handle |= TC_H_MIN(1);
 484         tcm->tcm_info = q->qdisc->handle;
 485
 486         return 0;
 487 }
 488
 489 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 490                      struct Qdisc **old, struct netlink_ext_ack *extack)
 491 {
 492         struct tbf_sched_data *q = qdisc_priv(sch);
 493
 494         if (new == NULL)
 495                 new = &noop_qdisc;
 496
 497         *old = qdisc_replace(sch, new, &q->qdisc);
 498         return 0;
 499 }
 500
 501 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
 502 {
 503         struct tbf_sched_data *q = qdisc_priv(sch);
 504         return q->qdisc;
 505 }
 506
 507 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
 508 {
 509         return 1;
 510 }
 511
 512 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 513 {
 514         if (!walker->stop) {
 515                 if (walker->count >= walker->skip)
 516                         if (walker->fn(sch, 1, walker) < 0) {
 517                                 walker->stop = 1;
 518                                 return;
 519                         }
 520                 walker->count++;
 521         }
 522 }
 523
 524 static const struct Qdisc_class_ops tbf_class_ops = {
 525         .graft          =       tbf_graft,
 526         .leaf           =       tbf_leaf,
 527         .find           =       tbf_find,
 528         .walk           =       tbf_walk,
 529         .dump           =       tbf_dump_class,
 530 };
 531
 532 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 533         .next           =       NULL,
 534         .cl_ops         =       &tbf_class_ops,
 535         .id             =       "tbf",
 536         .priv_size      =       sizeof(struct tbf_sched_data),
 537         .enqueue        =       tbf_enqueue,
 538         .dequeue        =       tbf_dequeue,
 539         .peek           =       qdisc_peek_dequeued,
 540         .init           =       tbf_init,
 541         .reset          =       tbf_reset,
 542         .destroy        =       tbf_destroy,
 543         .change         =       tbf_change,
 544         .dump           =       tbf_dump,
 545         .owner          =       THIS_MODULE,
 546 };
 547
 548 static int __init tbf_module_init(void)
 549 {
 550         return register_qdisc(&tbf_qdisc_ops);
 551 }
 552
 553 static void __exit tbf_module_exit(void)
 554 {
 555         unregister_qdisc(&tbf_qdisc_ops);
 556 }
 557 module_init(tbf_module_init)
 558 module_exit(tbf_module_exit)
 559 MODULE_LICENSE("GPL");