Linux内核中流量控制(24)

Linux内核中流量控制(24)
本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整性，严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源：http://yfydz.cublog.cn
8.10 mirred (mirror and redirection): packet mirroring and redirect actions. mirred动作是对数据进行镜像和重定向操作, 将数据包从指定网卡发出, 在net/sched/act_mirred.c中定义8.10.1 数据结构和动作操作结构/* include/linux/tc_act/tc_mirred.h */struct tc_mirred{ tc_gen;// 动作 int                     eaction;   /* one of IN/EGRESS_MIRROR/REDIR */// 数据包发出网卡索引号 __u32                   ifindex;  /* ifindex of egress port */};/* include/net/tc_act/tc_mirred.h */// mirred动作结构struct tcf_mirred { struct tcf_common common; int   tcfm_eaction; int   tcfm_ifindex; int   tcfm_ok_push; struct net_device *tcfm_dev;};#define to_mirred(pc) \ container_of(pc, struct tcf_mirred, common)/* net/sched/act_mirred.c */static struct tcf_hashinfo mirred_hash_info = { .htab = tcf_mirred_ht, .hmask = MIRRED_TAB_MASK, .lock = &mirred_lock,}; // mirred动作操作结构static struct tc_action_ops act_mirred_ops = {// 名称 .kind  = "mirred", .hinfo  = &mirred_hash_info,// 类型 .type  = TCA_ACT_MIRRED, .capab  = TCA_CAP_NONE, .owner  = THIS_MODULE, .act  = tcf_mirred, .dump  = tcf_mirred_dump, .cleanup = tcf_mirred_cleanup,// 查找, 通用函数 .lookup  = tcf_hash_search, .init  = tcf_mirred_init,// 遍历, 通用函数 .walk  = tcf_generic_walker}; 8.10.2 初始化static int tcf_mirred_init(struct rtattr *rta, struct rtattr *est,      struct tc_action *a, int ovr, int bind){ struct rtattr *tb[TCA_MIRRED_MAX]; struct tc_mirred *parm; struct tcf_mirred *m; struct tcf_common *pc; struct net_device *dev = NULL; int ret = 0; int ok_push = 0;// 解析参数, 保存于tb数组, 失败返回 if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0)  return -EINVAL;// 必须要有MIRRED参数 if (tb[TCA_MIRRED_PARMS-1] == NULL ||     RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm))  return -EINVAL; parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]);// 如果定义了网卡索引号 if (parm->ifindex) {// 查找相应的网卡设备结构  dev = __dev_get_by_index(parm->ifindex);  if (dev == NULL)   return -ENODEV;  switch (dev->type) {// 以下类型的网卡不需要扩展硬件头, 这些通常是虚拟网卡   case ARPHRD_TUNNEL:   case ARPHRD_TUNNEL6:   case ARPHRD_SIT:   case ARPHRD_IPGRE:   case ARPHRD_VOID:   
case ARPHRD_NONE:    ok_push = 0;    break;   default:// 其他类型网卡需要扩展硬件头    ok_push = 1;    break;  } }// 根据索引号查找common节点, 绑定到a节点(priv) pc = tcf_hash_check(parm-&gt;index, a, bind, &mirred_hash_info); if (!pc) {// 如果节点为空// 必须要有网卡参数  if (!parm-&gt;ifindex)   return -EINVAL;// 创建新的common节点  pc = tcf_hash_create(parm-&gt;index, est, a, sizeof(*m), bind,         &mirred_idx_gen, &mirred_hash_info);  if (unlikely(!pc))   return -ENOMEM;// 新建标志  ret = ACT_P_CREATED; } else {// ovr是替代标志, 如果不是替代操作, 对象已经存在, 操作失败  if (!ovr) {   tcf_mirred_release(to_mirred(pc), bind);   return -EEXIST;  } }// 转换为mirred动作结构 m = to_mirred(pc); spin_lock_bh(&m-&gt;tcf_lock);// 动作 m-&gt;tcf_action = parm-&gt;action;// 实际动作 m-&gt;tcfm_eaction = parm-&gt;eaction; if (parm-&gt;ifindex) {// 填充网卡参数  m-&gt;tcfm_ifindex = parm-&gt;ifindex;// 如果不是新建操作, 减少网卡计数, 因为已经引用过了  if (ret != ACT_P_CREATED)   dev_put(m-&gt;tcfm_dev);// 网卡  m-&gt;tcfm_dev = dev;  dev_hold(dev);// 硬件头扩展标志  m-&gt;tcfm_ok_push = ok_push; } spin_unlock_bh(&m-&gt;tcf_lock);// 如果是新建节点, 插入哈希表 if (ret == ACT_P_CREATED)  tcf_hash_insert(pc, &mirred_hash_info); return ret;}8.10.3 动作// 将数据包从指定网卡发出static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,        struct tcf_result *res){// mirred动作结构 struct tcf_mirred *m = a-&gt;priv; struct net_device *dev; struct sk_buff *skb2 = NULL;// 数据包自身的动作信息 u32 at = G_TC_AT(skb-&gt;tc_verd); spin_lock(&m-&gt;tcf_lock);// 网卡 dev = m-&gt;tcfm_dev;// 最后使用时间 m-&gt;tcf_tm.lastuse = jiffies; if (!(dev-&gt;flags&IFF_UP) ) {// 如果该网卡没运行, 丢包  if (net_ratelimit())   printk("mirred to Houston: device %s is gone!\n",          dev-&gt;name);bad_mirred:// 如果已经分配了克隆包, 释放  if (skb2 != NULL)   kfree_skb(skb2);// 统计参数更新// 阻塞数  m-&gt;tcf_qstats.overlimits++;// 包数, 总长度  m-&gt;tcf_bstats.bytes += skb-&gt;len;  m-&gt;tcf_bstats.packets++;  spin_unlock(&m-&gt;tcf_lock);  /* should we be asking for packet to be dropped?   
* may make sense for redirect case only  */// 返回丢包  return TC_ACT_SHOT; }// 克隆数据包用于镜像或重定向 skb2 = skb_clone(skb, GFP_ATOMIC);// 失败, 返回 if (skb2 == NULL)  goto bad_mirred;// 如果实际动作既不是镜像也不是重定向, 出错返回 if (m-&gt;tcfm_eaction != TCA_EGRESS_MIRROR &&     m-&gt;tcfm_eaction != TCA_EGRESS_REDIR) {  if (net_ratelimit())   printk("tcf_mirred unknown action %d\n",          m-&gt;tcfm_eaction);  goto bad_mirred; }// 统计数更新 m-&gt;tcf_bstats.bytes += skb2-&gt;len; m-&gt;tcf_bstats.packets++;// 如果不是发出的, 根据需要扩展硬件头 if (!(at & AT_EGRESS))  if (m-&gt;tcfm_ok_push)   skb_push(skb2, skb2-&gt;dev-&gt;hard_header_len); /* mirror is always swallowed */// 实际动作不是镜像, 重新设置TC判定值 if (m-&gt;tcfm_eaction != TCA_EGRESS_MIRROR)  skb2-&gt;tc_verd = SET_TC_FROM(skb2-&gt;tc_verd, at);// 将克隆的数据包从指定网卡发出 skb2-&gt;dev = dev;// 克隆数据包输入网卡为原数据包的发出网卡 skb2-&gt;input_dev = skb-&gt;dev; dev_queue_xmit(skb2); spin_unlock(&m-&gt;tcf_lock);// 返回对原数据包skb的动作 return m-&gt;tcf_action;}8.10.4 输出static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref){// 数据包缓冲区位置 unsigned char *b = skb-&gt;tail;// mirred动作结构 struct tcf_mirred *m = a-&gt;priv;// mirred选项参数 struct tc_mirred opt; struct tcf_t t;// 填充mirred选项参数// 索引号 opt.index = m-&gt;tcf_index;// 基本动作 opt.action = m-&gt;tcf_action;// 引用数 opt.refcnt = m-&gt;tcf_refcnt - ref;// 绑定数 opt.bindcnt = m-&gt;tcf_bindcnt - bind;// 克隆包动作 opt.eaction = m-&gt;tcfm_eaction;// 发出网卡 opt.ifindex = m-&gt;tcfm_ifindex; RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);// 时间参数// 建立时间 t.install = jiffies_to_clock_t(jiffies - m-&gt;tcf_tm.install);// 最后使用时间 t.lastuse = jiffies_to_clock_t(jiffies - m-&gt;tcf_tm.lastuse);// 到期时间 t.expires = jiffies_to_clock_t(m-&gt;tcf_tm.expires); RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); return skb-&gt;len;rtattr_failure: skb_trim(skb, b - skb-&gt;data); return -1;} 8.10.5 清除// 只是tcf_mirred_release的转换函数static int tcf_mirred_cleanup(struct tc_action *a, int bind){ struct tcf_mirred *m = a-&gt;priv; if (m)  return 
tcf_mirred_release(m, bind); return 0;} // mirred释放操作static inline int tcf_mirred_release(struct tcf_mirred *m, int bind){ if (m) {// 减少绑定数  if (bind)   m->tcf_bindcnt--;// 减少引用数  m->tcf_refcnt--;// 引用数和绑定数都为0时释放节点  if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) {// 减少网卡引用   dev_put(m->tcfm_dev);// 释放动作节点   tcf_hash_destroy(&m->common, &mirred_hash_info);   return 1;  } } return 0;} 8.11 pedit (Generic packet editor): pedit定义一个通用的数据包编辑处理方法, 代码在net/sched/act_pedit.c中定义.8.11.1 数据结构和动作操作结构/* include/net/tc_act/tc_pedit.h */// pedit动作结构struct tcf_pedit { struct tcf_common common;// key的数量 unsigned char  tcfp_nkeys;// 标志 unsigned char  tcfp_flags;// key数组 struct tc_pedit_key *tcfp_keys;};#define to_pedit(pc) \ container_of(pc, struct tcf_pedit, common)/* include/linux/tc_act/tc_pedit.h */// key结构用于定义对数据包进行的操作处理, 对数据包中指定偏移的数据进行更改struct tc_pedit_key{ __u32           mask;  /* AND */ __u32           val;   /*XOR */ __u32           off;  /*offset */ __u32           at; __u32           offmask; __u32           shift;};                                                                               struct tc_pedit_sel{ tc_gen; unsigned char           nkeys; unsigned char           flags; struct tc_pedit_key     keys[0];};#define tc_pedit tc_pedit_sel/* net/sched/act_pedit.c */// PEDIT哈希表信息结构static struct tcf_hashinfo pedit_hash_info = { .htab = tcf_pedit_ht, .hmask = PEDIT_TAB_MASK, .lock = &pedit_lock,}; // PEDIT动作操作结构static struct tc_action_ops act_pedit_ops = { .kind  = "pedit", .hinfo  = &pedit_hash_info, .type  = TCA_ACT_PEDIT, .capab  = TCA_CAP_NONE, .owner  = THIS_MODULE, .act  = tcf_pedit, .dump  = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup,// 通用函数 .lookup  = tcf_hash_search, .init  = tcf_pedit_init,// 通用函数 .walk  = tcf_generic_walker}; 8.11.2 初始化static int tcf_pedit_init(struct rtattr *rta, struct rtattr *est,     struct tc_action *a, int ovr, int bind){ struct rtattr *tb[TCA_PEDIT_MAX]; struct tc_pedit *parm; int ret = 0; struct tcf_pedit *p; struct 
tcf_common *pc; struct tc_pedit_key *keys = NULL; int ksize;// 解析输入参数, 结果保存到tb数组, 失败则返回 if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0)  return -EINVAL;// 解析参数, PEDIT参数不能为空 if (tb[TCA_PEDIT_PARMS - 1] == NULL ||     RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm))  return -EINVAL;// 参数指针 parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]);// key数组大小 ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize)  return -EINVAL;// 根据索引号查找common结构 pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); if (!pc) {// 没找到// 如果key数量为0, 非法参数  if (!parm->nkeys)   return -EINVAL;// 新建一个common结构  pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,         &pedit_idx_gen, &pedit_hash_info);  if (unlikely(!pc))   return -ENOMEM;// 获取PEDIT结构指针  p = to_pedit(pc);// 分配key数组空间  keys = kmalloc(ksize, GFP_KERNEL);// 如果失败, 将刚分配的common空间释放后返回  if (keys == NULL) {   kfree(pc);   return -ENOMEM;  }// 新建标志  ret = ACT_P_CREATED; } else {// 找到的话// 获取PEDIT结构指针  p = to_pedit(pc);// 检查是否是替代操作, 否则失败, 对象已经存在  if (!ovr) {   tcf_hash_release(pc, bind, &pedit_hash_info);   return -EEXIST;  }// 如果key数组大小和原来的不同, 重新分配key数组空间  if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {   keys = kmalloc(ksize, GFP_KERNEL);   if (keys == NULL)    return -ENOMEM;  } } spin_lock_bh(&p->tcf_lock);// 填写PEDIT结构参数// 标志 p->tcfp_flags = parm->flags;// 动作结果 p->tcf_action = parm->action;// keys非空表示分配了新的key数组(新建节点, 或替代操作且key数量有变化); 替代操作且key数量不变时keys为空, 沿用原数组 if (keys) {// 释放原来的key数组空间  kfree(p->tcfp_keys);// 更新key参数  p->tcfp_keys = keys;  p->tcfp_nkeys = parm->nkeys; }// 复制key数组信息 memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock);// 如果是新建节点, 插入哈希表 if (ret == ACT_P_CREATED)  tcf_hash_insert(pc, &pedit_hash_info); return ret;} 8.11.3 动作// 只修改数据包数据, 不管校验和的重新计算, 所以应该不适合所有协议的static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,       struct tcf_result *res){// PEDIT动作结构为a的私有数据 struct tcf_pedit *p = 
a-&gt;priv; int i, munged = 0; u8 *pptr;// 如果没有TC_OK2MUNGE(可以修改)标志, 如果是克隆包等就不能直接修改, 必须是独立的包 if (!(skb-&gt;tc_verd & TC_OK2MUNGE)) {  /* should we set skb-&gt;cloned? */// 重新分配数据包的data缓冲区  if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {// 分配失败直接返回动作结果   return p-&gt;tcf_action;  } }// 网络层数据头 pptr = skb-&gt;nh.raw; spin_lock(&p-&gt;tcf_lock);//pedit动作结构的最新使用时间 p-&gt;tcf_tm.lastuse = jiffies;// 存在key if (p-&gt;tcfp_nkeys &gt; 0) {  struct tc_pedit_key *tkey = p-&gt;tcfp_keys;// 循环遍历所有key  for (i = p-&gt;tcfp_nkeys; i &gt; 0; i--, tkey++) {   u32 *ptr;   int offset = tkey-&gt;off;// 如果偏移掩码非0   if (tkey-&gt;offmask) {// at指定数据位置, 不能超过数据包长度    if (skb-&gt;len &gt; tkey-&gt;at) {// 定位到at位置      char *j = pptr + tkey-&gt;at;// 根据该出字节的值更新偏移offset      offset += ((*j & tkey-&gt;offmask) &gt;&gt;                tkey-&gt;shift);    } else {     goto bad;    }   }// 偏移量必须4字节对齐   if (offset % 4) {    printk("offset must be on 32 bit boundaries\n");    goto bad;   }// 检查数据包长度是否合法, 数据偏移是否合法, 不能超过数据包长   if (skb-&gt;len &lt; 0 ||       (offset &gt; 0 && offset &gt; skb-&gt;len)) {    printk("offset %d cant exceed pkt length %d\n",           offset, skb-&gt;len);    goto bad;   }// 定位要编辑的数据位置   ptr = (u32 *)(pptr+offset);   /* just do it, baby */// 更新该位置处的数据: 和掩码与, 再和数值异或   *ptr = ((*ptr & tkey-&gt;mask) ^ tkey-&gt;val);   munged++;  }// 设置数据包已经修改标志    if (munged)   skb-&gt;tc_verd = SET_TC_MUNGED(skb-&gt;tc_verd);  goto done; } else {  printk("pedit BUG: index %d\n", p-&gt;tcf_index); }bad:// 更新阻塞数 p-&gt;tcf_qstats.overlimits++;done:// 更新包数, 字节数 p-&gt;tcf_bstats.bytes += skb-&gt;len; p-&gt;tcf_bstats.packets++; spin_unlock(&p-&gt;tcf_lock);// 返回动作 return p-&gt;tcf_action;} 8.11.4 输出static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,     int bind, int ref){// 数据包缓冲区起始位置 unsigned char *b = skb-&gt;tail; struct tcf_pedit *p = a-&gt;priv;// PEDIT选项结构, 中间变量, 其他类型的动作直接用结构, 也就是在堆栈中进行 struct tc_pedit *opt;// 时间参数 struct tcf_t t; int s;// 选项结构参数长度, 结构长度加所有key的长度   s = 
sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key); /* netlink spinlocks held above us - must use ATOMIC */// 分配空间 opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt))  return -ENOBUFS;// 复制key数值 memcpy(opt->keys, p->tcfp_keys,        p->tcfp_nkeys * sizeof(struct tc_pedit_key));// 填写选项结构参数// 索引号 opt->index = p->tcf_index;// key数量 opt->nkeys = p->tcfp_nkeys;// 标志 opt->flags = p->tcfp_flags;// 动作 opt->action = p->tcf_action;// 引用数 opt->refcnt = p->tcf_refcnt - ref;// 绑定数 opt->bindcnt = p->tcf_bindcnt - bind;// 填写到数据包 RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt);// 填写时间// 生成时间 t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);// 最新使用时间 t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);// 到期时间 t.expires = jiffies_to_clock_t(p->tcf_tm.expires);// 拷贝到skb缓冲区 RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);// 释放选项空间, 不需要了 kfree(opt);// 返回当前数据长度 return skb->len;rtattr_failure: skb_trim(skb, b - skb->data); kfree(opt); return -1;} 8.11.5 清除static int tcf_pedit_cleanup(struct tc_action *a, int bind){// pedit动作结构 struct tcf_pedit *p = a->priv; if (p) {// key数组地址  struct tc_pedit_key *keys = p->tcfp_keys;// 先释放节点  if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) {// 成功的话再释放key数组空间   kfree(keys);   return 1;  } } return 0;} 8.12 police (Input police filter) police相对来说是最复杂的一个动作处理方法了, 而且根据内核是否定义CONFIG_NET_CLS_ACT, 处理方法有所不同, 本文只分析定义了该选项的情况, 该动作和其他动作不同, 有自己的专有链表进行保存, 而不是系统的链表。该方法使用了TBF流控算法，对超过限制值的数据包可进行指定的结果操作，代码在net/sched/act_police.c中定义.8.12.1 数据结构和动作操作结构/* include/linux/pkt_cls.h */struct tc_police{ __u32   index; int   action; #define TC_POLICE_UNSPEC TC_ACT_UNSPEC #define TC_POLICE_OK  TC_ACT_OK #define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY #define TC_POLICE_SHOT  TC_ACT_SHOT #define TC_POLICE_PIPE  TC_ACT_PIPE __u32   limit; __u32   burst; __u32   mtu; struct tc_ratespec rate; struct tc_ratespec peakrate; int    refcnt; int    bindcnt; __u32   capab;};/* include/net/act_api.h */// 
TCF警察struct tcf_police {// 通用结构 struct tcf_common common;// 流控限制内的处理结果 int   tcfp_result;// 速率 u32   tcfp_ewma_rate;// 爆发率 u32   tcfp_burst;// MTU u32   tcfp_mtu;// 令牌 u32   tcfp_toks;// P令牌(peak?) u32   tcfp_ptoks;// 时间 psched_time_t  tcfp_t_c;// 速率表 struct qdisc_rate_table *tcfp_R_tab;// P速率表 struct qdisc_rate_table *tcfp_P_tab;};#define to_police(pc) \ container_of(pc, struct tcf_police, common)struct tcf_hashinfo { struct tcf_common **htab; unsigned int  hmask; rwlock_t  *lock;};/* net/sched/act_police.c */// police哈希表信息结构static struct tcf_hashinfo police_hash_info = { .htab = tcf_police_ht, .hmask = POL_TAB_MASK, .lock = &police_lock,};/* old policer structure from before tc actions */// 老结构struct tc_police_compat{ u32   index; int   action; u32   limit; u32   burst; u32   mtu; struct tc_ratespec rate; struct tc_ratespec peakrate;}; // police动作操作结构static struct tc_action_ops act_police_ops = { .kind  = "police", .hinfo  = &police_hash_info, .type  = TCA_ID_POLICE, .capab  = TCA_CAP_NONE, .owner  = THIS_MODULE, .act  = tcf_act_police, .dump  = tcf_act_police_dump, .cleanup = tcf_act_police_cleanup,// 通用函数 .lookup  = tcf_hash_search, .init  = tcf_act_police_locate, .walk  = tcf_act_police_walker}; 8.12.2 初始化static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est,                                 struct tc_action *a, int ovr, int bind){ unsigned h; int ret = 0, err; struct rtattr *tb[TCA_POLICE_MAX]; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; int size;// 参数解析, 结果保存于tb数组, 解析失败则返回 if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)  return -EINVAL;// 必须有TCA_POLICE_TBF参数 if (tb[TCA_POLICE_TBF-1] == NULL)  return -EINVAL;// 数据大小 size = RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]);// 必须是struct tc_police结构或tc_police_compat结构大小 if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))  return -EINVAL;// 获取参数 parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);// 检查RESULT参数 if 
(tb[TCA_POLICE_RESULT-1] != NULL &&     RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))  return -EINVAL;// 代码重复了, 虽然没错, 第一次见到这种情况 if (tb[TCA_POLICE_RESULT-1] != NULL &&     RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))  return -EINVAL;// 如果索引号非0 if (parm-&gt;index) {  struct tcf_common *pc;// 根据索引号查找common节点  pc = tcf_hash_lookup(parm-&gt;index, &police_hash_info);  if (pc != NULL) {// 找到, 将common节点设置为动作结构的私有数据   a-&gt;priv = pc;// 转换为police指针   police = to_police(pc);// 绑定   if (bind) {    police-&gt;tcf_bindcnt += 1;    police-&gt;tcf_refcnt += 1;   }// 如果是更新replace, 跳转到更新操作   if (ovr)    goto override;// 否则返回成功, 不需要修改原有结构中的数据   return ret;  } }// 索引号为0, 或没找到原有的common节点, 新建节点// 分配police空间 police = kzalloc(sizeof(*police), GFP_KERNEL); if (police == NULL)  return -ENOMEM;// 新建标志 ret = ACT_P_CREATED;// 设置police结构参数// 初始化引用数为1 police-&gt;tcf_refcnt = 1; spin_lock_init(&police-&gt;tcf_lock);// 统计锁 police-&gt;tcf_stats_lock = &police-&gt;tcf_lock;// 绑定数 if (bind)  police-&gt;tcf_bindcnt = 1;override: if (parm-&gt;rate.rate) {// 如果有流量限制参数  err = -ENOMEM;// 建立流量控制结构  R_tab = qdisc_get_rtab(&parm-&gt;rate, tb[TCA_POLICE_RATE-1]);  if (R_tab == NULL)   goto failure;  if (parm-&gt;peakrate.rate) {// 如果有峰值流量限制, 建立峰值流控结构   P_tab = qdisc_get_rtab(&parm-&gt;peakrate,            tb[TCA_POLICE_PEAKRATE-1]);   if (P_tab == NULL) {    qdisc_put_rtab(R_tab);    goto failure;   }  } } /* No failure allowed after this point */ spin_lock_bh(&police-&gt;tcf_lock); if (R_tab != NULL) {// 释放原来的流量限制结构  qdisc_put_rtab(police-&gt;tcfp_R_tab);// 更新为新的结构  police-&gt;tcfp_R_tab = R_tab; } if (P_tab != NULL) {// 更新峰值流量限制结构  qdisc_put_rtab(police-&gt;tcfp_P_tab);  police-&gt;tcfp_P_tab = P_tab; }// 解析POLICE_RESULT参数 if (tb[TCA_POLICE_RESULT-1])  police-&gt;tcfp_result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);// 令牌数和爆发数初始化 police-&gt;tcfp_toks = police-&gt;tcfp_burst = parm-&gt;burst;// MTU police-&gt;tcfp_mtu = parm-&gt;mtu; if (police-&gt;tcfp_mtu == 0) {// 如果MTU为0, 改为全1  
police-&gt;tcfp_mtu = ~0;// 设置峰值流控的MTU  if (police-&gt;tcfp_R_tab)   police-&gt;tcfp_mtu = 255&lt;&lt;police-&gt;tcfp_R_tab-&gt;rate.cell_log; }// 设置当前峰值流控令牌数 if (police-&gt;tcfp_P_tab)  police-&gt;tcfp_ptoks = L2T_P(police, police-&gt;tcfp_mtu);// police动作 police-&gt;tcf_action = parm-&gt;action;#ifdef CONFIG_NET_ESTIMATOR// 处理估计器 if (tb[TCA_POLICE_AVRATE-1])  police-&gt;tcfp_ewma_rate =   *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); if (est)  gen_replace_estimator(&police-&gt;tcf_bstats,          &police-&gt;tcf_rate_est,          police-&gt;tcf_stats_lock, est);#endif spin_unlock_bh(&police-&gt;tcf_lock);// 如果不是新建的节点, 可以返回了 if (ret != ACT_P_CREATED)  return ret; PSCHED_GET_TIME(police-&gt;tcfp_t_c);// 更新police结构的索引号 police-&gt;tcf_index = parm-&gt;index ? parm-&gt;index :  tcf_hash_new_index(&police_idx_gen, &police_hash_info);// 计算哈希数 h = tcf_hash(police-&gt;tcf_index, POL_TAB_MASK); write_lock_bh(&police_lock);// 将新节点插入tcf_police_ht[h]链表作为头节点, 注意不是插入系统的common哈希链表 police-&gt;tcf_next = tcf_police_ht[h]; tcf_police_ht[h] = &police-&gt;common; write_unlock_bh(&police_lock);// 将police结构作为动作a的私有数据 a-&gt;priv = police; return ret;failure:// 错误处理, 如果是新建操作, 释放新分配的police结构 if (ret == ACT_P_CREATED)  kfree(police); return err;} 8.12.3 动作static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,                          struct tcf_result *res){// police结构为a的私有数据 struct tcf_police *police = a-&gt;priv;// 当前时间 psched_time_t now;// 令牌数 long toks;// 峰值令牌数 long ptoks = 0; spin_lock(&police-&gt;tcf_lock);// 统计数更新 police-&gt;tcf_bstats.bytes += skb-&gt;len; police-&gt;tcf_bstats.packets++;#ifdef CONFIG_NET_ESTIMATOR// 判断是否阻塞, 流量超过限制值 if (police-&gt;tcfp_ewma_rate &&     police-&gt;tcf_rate_est.bps &gt;= police-&gt;tcfp_ewma_rate) {  police-&gt;tcf_qstats.overlimits++;  spin_unlock(&police-&gt;tcf_lock);  return police-&gt;tcf_action; }#endif// 如果数据包长度不超过MTU，在流量限制范围内 if (skb-&gt;len &lt;= police-&gt;tcfp_mtu) {  if (police-&gt;tcfp_R_tab == NULL) {// 没有流控表, 
返回限制结果tcfp_result   spin_unlock(&police->tcf_lock);   return police->tcfp_result;  }// 获取当前时间  PSCHED_GET_TIME(now);// 计算时间对应令牌  toks = PSCHED_TDIFF_SAFE(now, police->tcfp_t_c,      police->tcfp_burst);  if (police->tcfp_P_tab) {// 如果存在峰值流控   ptoks = toks + police->tcfp_ptoks;// MTU对应的峰值令牌   if (ptoks > (long)L2T_P(police, police->tcfp_mtu))    ptoks = (long)L2T_P(police, police->tcfp_mtu);// 减去当前数据包长对应的峰值令牌数   ptoks -= L2T_P(police, skb->len);  }// 令牌增加原来的桶里的令牌  toks += police->tcfp_toks;// 限制令牌值不超过burst值  if (toks > (long)police->tcfp_burst)   toks = police->tcfp_burst;// 令牌数减去数据包长对应的令牌数  toks -= L2T(police, skb->len);  if ((toks|ptoks) >= 0) {// 两种令牌数都不小于0, 在流量限制范围内// 更新时间和令牌   police->tcfp_t_c = now;   police->tcfp_toks = toks;   police->tcfp_ptoks = ptoks;   spin_unlock(&police->tcf_lock);// 返回不超过流控限制下的动作处理结果   return police->tcfp_result;  } }// 超过流量限制了 police->tcf_qstats.overlimits++; spin_unlock(&police->tcf_lock);// 返回动作处理结果 return police->tcf_action;} 8.12.4 输出static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref){// 数据包缓冲区起始位置 unsigned char  *b = skb->tail;// police结构为a的私有数据 struct tcf_police *police = a->priv;// 选项参数, 中间数据 struct tc_police opt;// 填写选项参数// 索引号 opt.index = police->tcf_index;// 动作 opt.action = police->tcf_action;// MTU opt.mtu = police->tcfp_mtu;// 爆发值 opt.burst = police->tcfp_burst;// 引用数 opt.refcnt = police->tcf_refcnt - ref;// 绑定数 opt.bindcnt = police->tcf_bindcnt - bind;// 速率控制 if (police->tcfp_R_tab)  opt.rate = police->tcfp_R_tab->rate; else  memset(&opt.rate, 0, sizeof(opt.rate));// 峰值速率控制 if (police->tcfp_P_tab)  opt.peakrate = police->tcfp_P_tab->rate; else  memset(&opt.peakrate, 0, sizeof(opt.peakrate));// 将选项参数填写到skb数据包 RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);// 处理结果 if (police->tcfp_result)  RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int),   &police->tcfp_result);#ifdef 
CONFIG_NET_ESTIMATOR// 估计器 if (police-&gt;tcfp_ewma_rate)  RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &police-&gt;tcfp_ewma_rate);#endif// 返回现在的数据包长度 return skb-&gt;len;rtattr_failure: skb_trim(skb, b - skb-&gt;data); return -1;} 8.12.5 清除// 就是tcf_police_release的包裹函数static int tcf_act_police_cleanup(struct tc_action *a, int bind){ struct tcf_police *p = a-&gt;priv; if (p != NULL)  return tcf_police_release(p, bind); return 0;}/* include/net/act_api.h */static inline inttcf_police_release(struct tcf_police *p, int bind){ int ret = 0;#ifdef CONFIG_NET_CLS_ACT if (p) {// 减少绑定数  if (bind)   p-&gt;tcf_bindcnt--;// 减少引用数  p-&gt;tcf_refcnt--;// 绑定数和引用数都为0, 释放police节点  if (p-&gt;tcf_refcnt &lt;= 0 && !p-&gt;tcf_bindcnt) {   tcf_police_destroy(p);   ret = 1;  } }#else if (p && --p-&gt;tcf_refcnt == 0)  tcf_police_destroy(p);#endif /* CONFIG_NET_CLS_ACT */ return ret;}/* net/sched/act_police.c */void tcf_police_destroy(struct tcf_police *p){// 根据索引号计算哈希数 unsigned int h = tcf_hash(p-&gt;tcf_index, POL_TAB_MASK); struct tcf_common **p1p;// 遍历指定的哈希链表, 查找地址匹配的common节点 for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)-&gt;tcfc_next) {  if (*p1p == &p-&gt;common) {// 找到   write_lock_bh(&police_lock);// 从链表断开   *p1p = p-&gt;tcf_next;   write_unlock_bh(&police_lock);#ifdef CONFIG_NET_ESTIMATOR// 释放估计器   gen_kill_estimator(&p-&gt;tcf_bstats,        &p-&gt;tcf_rate_est);#endif// 释放流控表   if (p-&gt;tcfp_R_tab)    qdisc_put_rtab(p-&gt;tcfp_R_tab);// 释放峰值流控表   if (p-&gt;tcfp_P_tab)    qdisc_put_rtab(p-&gt;tcfp_P_tab);// 释放节点   kfree(p);   return;  } } BUG_TRAP(0);}8.12.6 遍历// police是目前分析的唯一一个自定义遍历函数的动作, 执行删除和输出两种操作// 因为不是用系统的哈希表, 用的是自己的哈希表static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,                              int type, struct tc_action *a){ struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct rtattr *r; read_lock(&police_lock);// 要跳过的节点数 s_i = cb-&gt;args[0];// 遍历所有哈希表 for (i = 0; i &lt; (POL_TAB_MASK + 1); i++) {// 链表头, 
使用tcf_hash似乎没必要, 因为i是不超过POL_TAB_MASK的  p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];// 遍历链表  for (; p; p = p-&gt;tcfc_next) {// 统计数   index++;// 小于要跳过的节点数, 跳过   if (index &lt; s_i)    continue;// 将节点p作为a私有数据   a-&gt;priv = p;   a-&gt;order = index;// 数据包的末尾   r = (struct rtattr*) skb-&gt;tail;   RTA_PUT(skb, a-&gt;order, 0, NULL);// 执行删除或获取操作, 将前面   if (type == RTM_DELACTION)    err = tcf_action_dump_1(skb, a, 0, 1);   else    err = tcf_action_dump_1(skb, a, 0, 0);   if (err &lt; 0) {// 操作失败. 中断循环    index--;    skb_trim(skb, (u8*)r - skb-&gt;data);    goto done;   }// rtnetlink属性数据长度   r-&gt;rta_len = skb-&gt;tail - (u8*)r;// 处理过的节点增加   n_i++;  } }done: read_unlock(&police_lock);// 增加处理过的节点的数量 if (n_i)  cb-&gt;args[0] += n_i; return n_i;rtattr_failure: skb_trim(skb, (u8*)r - skb-&gt;data); goto done;}9. 总结Linux内核中的流量控制处理基本结构是Qdisc，简单情况下只用Qdisc就可以满足流控的要求，如FIFO，TBF等；但如果想对数据进行分类流控，就需要增加class和filter的相关处理，前者建立类别处理树，后者则定义什么样的数据属于什么类别，然后针对每个类别数据设置自己的Qdisc，就可以进行细粒度地流控处理了，关于action处理，窃以为意思不是很大，因为不是丢包就是发包，也不会象netfilter那样有各种各样的target处理。流控处理实现的重要特点就是功能的对象化，功能充分模块化，容易扩展，虽然是C程序，但可以认为和C++一样的实现了对象的封装处理，充分体现了OO的观念，也就是说OO是程序设计理念，而不是只限制只能由支持对象的语言实现，任何语言都可以用来实现OO设计思想。
Linux内核中流量控制(24)

热点推荐