Linux内核中流量控制(17)
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
7.6 tcf_proto_ops的一些相关操作7.6.1 登记和撤销/* Register(unregister) new classifier type */// 登记新的tcf_proto_ops分类操作结构int register_tcf_proto_ops(struct tcf_proto_ops *ops){ struct tcf_proto_ops *t, **tp; int rc = -EEXIST; write_lock(&cls_mod_lock);// 遍历当前tcf_proto_ops链表 for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)// 检查是否有名称相同的项, 有的话返回对象已存在错误 if (!strcmp(ops->kind, t->kind)) goto out;// 添加到链表末尾, 也是dummy header算法 ops->next = NULL; *tp = ops; rc = 0;out: write_unlock(&cls_mod_lock); return rc;}// 撤销tcf_proto_ops分类结构int unregister_tcf_proto_ops(struct tcf_proto_ops *ops){ struct tcf_proto_ops *t, **tp; int rc = -ENOENT; write_lock(&cls_mod_lock);// 遍历链表 for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)// 直接进行tcf_proto_ops结构地址比较, 相同的话中断循环 if (t == ops) break; if (!t) goto out;// 将找到的tp节点从链表中断开, 不用释放操作, 因为这些ops其实都是静态定义的 *tp = t->next; rc = 0;out: write_unlock(&cls_mod_lock); return rc;}7.6.2 tcf扩展tcf扩展增加了对分类后数据进行某种操作的功能, 有点象netfilter的target,使用这些功能需要在配置内核时定义NET_CLS_ACT或NET_CLS_POLICE。/* include/net/pkt_cls.h */// tcf扩展结构, 如果没定义NET_CLS_ACT和NET_CLS_POLICE的话就是个空结构struct tcf_exts{#ifdef CONFIG_NET_CLS_ACT// 动作 struct tc_action *action;#elif defined CONFIG_NET_CLS_POLICE// 策略 struct tcf_police *police;#endif};/* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */struct tcf_ext_map{ int action; int police;};/** * tcf_exts_is_predicative - check if a predicative extension is present * @exts: tc filter extensions handle * * Returns 1 if a predicative extension is present, i.e. an extension which * might cause further actions and thus overrule the regular tcf_result. 
*/// 返回扩展结构中的元素是否非空(非空返回1, 否则返回0)static inline inttcf_exts_is_predicative(struct tcf_exts *exts){#ifdef CONFIG_NET_CLS_ACT// !!是为了保证返回值0或1 return !!exts->action;#elif defined CONFIG_NET_CLS_POLICE return !!exts->police;#else return 0;#endif}/** * tcf_exts_is_available - check if at least one extension is present * @exts: tc filter extensions handle * * Returns 1 if at least one extension is present. */// 实际就是tcf_exts_is_predicative函数static inline inttcf_exts_is_available(struct tcf_exts *exts){ /* All non-predicative extensions must be added here. */ return tcf_exts_is_predicative(exts);}/** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns 0 on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */static inline inttcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res){#ifdef CONFIG_NET_CLS_ACT if (exts->action) return tcf_action_exec(skb, exts->action, res);#elif defined CONFIG_NET_CLS_POLICE if (exts->police) return tcf_police(skb, exts->police);#endif return 0;} /* net/sched/cls_api.c */// 释放tcf扩展结构voidtcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts){#ifdef CONFIG_NET_CLS_ACT if (exts->action) {// 释放tcf动作 tcf_action_destroy(exts->action, TCA_ACT_UNBIND); exts->action = NULL; }#elif defined CONFIG_NET_CLS_POLICE if (exts->police) {// 释放tcf策略 tcf_police_release(exts->police, TCA_ACT_UNBIND); exts->police = NULL; }#endif} inttcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, struct rtattr *rate_tlv, struct tcf_exts *exts, struct tcf_ext_map *map){// 结构清零 memset(exts, 0, sizeof(*exts)); #ifdef CONFIG_NET_CLS_ACT { int err; struct tc_action *act;// 如果策略存在 if (map->police && tb[map->police-1]) {// 进行动作初始化, 生成新动作指针 act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", 
TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); if (act == NULL) return err;// TCA_OLD_COMPAT标志是策略 act->type = TCA_OLD_COMPAT; exts->action = act; } else// 如果动作存在 if (map->action && tb[map->action-1]) {// 动作初始化, 生成动作指针 act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); if (act == NULL) return err;// action赋值 exts->action = act; } }#elif defined CONFIG_NET_CLS_POLICE// 如果策略存在 if (map->police && tb[map->police-1]) { struct tcf_police *p;// 生成新策略结构 p = tcf_police_locate(tb[map->police-1], rate_tlv); if (p == NULL) return -EINVAL;// police赋值 exts->police = p; } else if (map->action && tb[map->action-1]) return -EOPNOTSUPP;#else if ((map->action && tb[map->action-1]) || (map->police && tb[map->police-1])) return -EOPNOTSUPP;#endif return 0;}// 修改扩展结构, 将src中的参数填到dst中voidtcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src){#ifdef CONFIG_NET_CLS_ACT// 动作操作 if (src->action) { struct tc_action *act; tcf_tree_lock(tp);// dst和src的action进行交换, 返回原来dst->action act = xchg(&dst->action, src->action); tcf_tree_unlock(tp);// 如果原来的action非空, 释放之 if (act) tcf_action_destroy(act, TCA_ACT_UNBIND); }#elif defined CONFIG_NET_CLS_POLICE// 策略操作 if (src->police) { struct tcf_police *p; tcf_tree_lock(tp);// dst和src的police进行交换, 返回原来dst->police p = xchg(&dst->police, src->police); tcf_tree_unlock(tp);// 如果原来的police非空, 释放之 if (p) tcf_police_release(p, TCA_ACT_UNBIND); }#endif}// 输出扩展结构inttcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_ext_map *map){#ifdef CONFIG_NET_CLS_ACT// 输出动作 if (map->action && exts->action) {// 参数存在 /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */// 在数据包缓冲区中定位 struct rtattr * p_rta = (struct rtattr*) skb->tail; if (exts->action->type != TCA_OLD_COMPAT) {// 类型是动作 RTA_PUT(skb, map->action, 0, NULL);// 动作输出 if (tcf_action_dump(skb, exts->action, 0, 0) < 0) goto rtattr_failure;// 数据长度 
p_rta->rta_len = skb->tail - (u8*)p_rta; } else if (map->police) {// 类型是策略 RTA_PUT(skb, map->police, 0, NULL);// 策略输出 if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) goto rtattr_failure;// 数据长度 p_rta->rta_len = skb->tail - (u8*)p_rta; } }#elif defined CONFIG_NET_CLS_POLICE// 输出策略 if (map->police && exts->police) {// 策略存在// 在数据包缓冲区中定位 struct rtattr * p_rta = (struct rtattr*) skb->tail; RTA_PUT(skb, map->police, 0, NULL);// 策略输出 if (tcf_police_dump(skb, exts->police) < 0) goto rtattr_failure;// 数据长度 p_rta->rta_len = skb->tail - (u8*)p_rta; }#endif return 0;rtattr_failure: __attribute__ ((unused)) return -1;} // 输出统计值inttcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_ext_map *map){#ifdef CONFIG_NET_CLS_ACT// 动作统计 if (exts->action)// 输出动作统计值到skb if (tcf_action_copy_stats(skb, exts->action, 1) < 0) goto rtattr_failure;#elif defined CONFIG_NET_CLS_POLICE// 策略统计 if (exts->police)// 输出策略统计值到skb if (tcf_police_dump_stats(skb, exts->police) < 0) goto rtattr_failure;#endif return 0;rtattr_failure: __attribute__ ((unused)) return -1;} 7.7 TC分类操作下面看一下分类函数是如何被调用的, 分类操作通过tc_classify()函数完成, 在以前介绍的各种分类流控算法中都见过该函数:/* net/sched/sch_api.c *//* Main classifier routine: scans classifier chain attached to this qdisc, (optionally) tests for protocol and asks specific classifiers. 
*///int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res){ int err = 0;// 协议, 这里应该是数据链路层中的协议值, 如IP包就是0x0800 u32 protocol = skb->protocol;#ifdef CONFIG_NET_CLS_ACT struct tcf_proto *otp = tp;reclassify:#endif protocol = skb->protocol;// 遍历分类规则链表 for ( ; tp; tp = tp->next) {// 首先要求协议匹配 if ((tp->protocol == protocol || tp->protocol == __constant_htons(ETH_P_ALL)) &&// 然后调用tcf_proto中的分类函数进行处理, 该函数实际就是tcf_proto_ops的分类函数// 分类结果>=0表示分类成功 (err = tp->classify(skb, tp, res)) >= 0) {#ifdef CONFIG_NET_CLS_ACT// 返回结果是需要重新分类 if ( TC_ACT_RECLASSIFY == err) {// 将skb中的tc_verd值转换为判断值, 实际是个计数器 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); tp = otp;// 转换次数太多, 返回丢包 if (MAX_REC_LOOP < verd++) { printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", tp->prio&0xffff, ntohs(tp->protocol)); return TC_ACT_SHOT; }// 转换回tc_verd skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);// 重新分类操作 goto reclassify; } else {// 非重新分类的话, 更新tc_verd, 返回分类结果 if (skb->tc_verd) skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); return err; }#else// 如果内核没定义NET_CLS_ACT, 直接返回分类操作结果 return err;#endif } }// 循环退出, 分类失败 return -1;}tc_classify()的核心函数就是tp->classify()函数 7.8 fw过滤操作结构fw分类方法主要是根据skb中nfmark参数来进行数据分类, 而该参数是由netfilter定义的, 如果内核里没有定义netfilter, 则该分类方法意义不大,该分类方法在 net/sched/cls_fw.c 中定义。7.8.1 结构定义static struct tcf_proto_ops cls_fw_ops = {// 这个参数可以不用明确写出来, 这种定义方法参数缺省就是0了 .next = NULL,// 名称 .kind = "fw",// 各种操作函数 .classify = fw_classify, .init = fw_init, .destroy = fw_destroy, .get = fw_get, .put = fw_put, .change = fw_change, .delete = fw_delete, .walk = fw_walk, .dump = fw_dump, .owner = THIS_MODULE,};// 哈希表数量, 空间限制为一页内存, x86是4K, 32位系统指针是4字节, 因此应该是1024个#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))// 链表头struct fw_head{ struct fw_filter *ht[HTSIZE]; u32 mask;};// fw过滤器struct fw_filter{ struct fw_filter *next; u32 id; struct tcf_result res;#ifdef CONFIG_NET_CLS_IND char indev[IFNAMSIZ];#endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts;};static struct tcf_ext_map fw_ext_map 
= { .action = TCA_FW_ACT, .police = TCA_FW_POLICE};7.8.2 初始化// 空函数static int fw_init(struct tcf_proto *tp){ return 0;} 7.8.3 分类// 计算哈希值static __inline__ int fw_hash(u32 handle){// 如果是4096, 2^12, 按8:12:12分割异或 if (HTSIZE == 4096) return ((handle >> 24) & 0xFFF) ^ ((handle >> 12) & 0xFFF) ^ (handle & 0xFFF);// 2048, 2^11, 按10:11:11分割异或 else if (HTSIZE == 2048) return ((handle >> 22) & 0x7FF) ^ ((handle >> 11) & 0x7FF) ^ (handle & 0x7FF);// 1024, 2^10, 按12:10:10分割异或 else if (HTSIZE == 1024) return ((handle >> 20) & 0x3FF) ^ ((handle >> 10) & 0x3FF) ^ (handle & 0x3FF);// 512, 2^9, 按5:9:9:9分割异或 else if (HTSIZE == 512) return (handle >> 27) ^ ((handle >> 18) & 0x1FF) ^ ((handle >> 9) & 0x1FF) ^ (handle & 0x1FF);// 256, 2^8, 按8:8:8:8分割异或 else if (HTSIZE == 256) { u8 *t = (u8 *) &handle; return t[0] ^ t[1] ^ t[2] ^ t[3]; } else return handle & (HTSIZE - 1);} // fw分类方法, 返回负数表示分类失败, 返回非负值表示分类成功(0或tcf_exts_exec返回的动作码),分类结果在res中返回static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res){// HASH链表头 struct fw_head *head = (struct fw_head*)tp->root; struct fw_filter *f; int r;#ifdef CONFIG_NETFILTER// 如果定义了netfilter, 使用nfmark作为ID, 用掩码掩一下(注意: 这里在判断head是否为空之前就访问了head->mask) u32 id = skb->nfmark & head->mask;#else// 否则ID为0,也就是说如果内核中没定义netfilter,分类的意义不大,都是同一类数据包 u32 id = 0;#endif if (head != NULL) {// 根据id进行hash, 遍历合适的链表 for (f=head->ht[fw_hash(id)]; f; f=f->next) {// 如果ID相同, 合适的话可以返回 if (f->id == id) { *res = f->res;#ifdef CONFIG_NET_CLS_IND// 网卡设备匹配 if (!tcf_match_indev(skb, f->indev)) continue;#endif /* CONFIG_NET_CLS_IND */// 如果没有定义NET_CLS_ACT和NET_CLS_POLICE的话就是个空函数, 返回0 r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } } } else {// 老分类方法, id非0, id高16位为0或和Qdisc的handle的高16位相同时, 分类成功 /* old method */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { res->classid = id; res->class = 0; return 0; } } return -1;}7.8.4 释放static void fw_destroy(struct tcf_proto *tp){// tcf_proto的根链表头置零, 原值保存准备用于释放 struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); struct fw_filter *f; 
int h;// 如果链表为空, 返回 if (head == NULL) return;// 遍历所有哈希表 for (h=0; h<HTSIZE; h++) {// 遍历链表 while ((f=head->ht[h]) != NULL) { head->ht[h] = f->next;// 释放该filter fw_delete_filter(tp, f); } } kfree(head);}// 释放fw过滤器节点static inline voidfw_delete_filter(struct tcf_proto *tp, struct fw_filter *f){// 将过滤器和tcf_proto断开, 调用Qdisc_class_ops中的unbind_tcf()成员函数 tcf_unbind_filter(tp, &f->res);// 释放tcf扩展元素 tcf_exts_destroy(tp, &f->exts);// 释放fw过滤器内存 kfree(f);} 7.8.5 获取过滤器static unsigned long fw_get(struct tcf_proto *tp, u32 handle){ struct fw_head *head = (struct fw_head*)tp->root; struct fw_filter *f; if (head == NULL) return 0;// 用handle进行哈希, 遍历指定的hash表 for (f=head->ht[fw_hash(handle)]; f; f=f->next) {// 如果有filter的id和handle相同, 返回 if (f->id == handle) return (unsigned long)f; } return 0;}7.8.6 放弃过滤器// 空函数static void fw_put(struct tcf_proto *tp, unsigned long f){}7.8.7 参数修改// 新建, 修改都通过该函数完成static int fw_change(struct tcf_proto *tp, unsigned long base, u32 handle, struct rtattr **tca, unsigned long *arg){// 根哈希节点 struct fw_head *head = (struct fw_head*)tp->root;// fw过滤器指针 struct fw_filter *f = (struct fw_filter *) *arg;// 选项参数 struct rtattr *opt = tca[TCA_OPTIONS-1]; struct rtattr *tb[TCA_FW_MAX]; int err;// 如果没提供选项, 在提供了handle的情况下错误, 否则返回成功 if (!opt) return handle ? 
-EINVAL : 0;// 解析选项参数是否合法 if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) return -EINVAL;// fw过滤器非空, 修改操作 if (f != NULL) {// 修改的情况下, 如果handle值非0, 而且和fw过滤器的id不同的话, 返回参数错误 if (f->id != handle && handle) return -EINVAL;// 进行参数修改操作 return fw_change_attrs(tp, f, tb, tca, base); }// 新建fw过滤器的情况, 如果handle为0, 返回参数错误 if (!handle) return -EINVAL;// 链表头为空, 第一次操作 if (head == NULL) {// 缺省掩码 u32 mask = 0xFFFFFFFF;// 如果在命令参数中定义了掩码, 获取之 if (tb[TCA_FW_MASK-1]) { if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != sizeof(u32)) return -EINVAL; mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]); }// 分配链表头空间 head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); if (head == NULL) return -ENOBUFS;// 掩码 head->mask = mask; tcf_tree_lock(tp);// 作为系统的根哈希链表头 tp->root = head; tcf_tree_unlock(tp); }// 分配新的fw过滤器结构指针 f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (f == NULL) return -ENOBUFS;// 使用handle值作为fw过滤器的ID f->id = handle;// 调用修改函数进行赋值操作 err = fw_change_attrs(tp, f, tb, tca, base); if (err < 0) goto errout;// 添加到合适的hash链表的头, 注意锁的使用 f->next = head->ht[fw_hash(handle)]; tcf_tree_lock(tp); head->ht[fw_hash(handle)] = f; tcf_tree_unlock(tp);// 将fw过滤器作为参数返回 *arg = (unsigned long)f; return 0;errout: kfree(f); return err;} // 参数修改处理static intfw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, struct rtattr **tb, struct rtattr **tca, unsigned long base){ struct fw_head *head = (struct fw_head *)tp->root; struct tcf_exts e; u32 mask; int err;// tcf扩展验证操作 err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); if (err < 0) return err; err = -EINVAL;// 命令参数中提供了类别ID if (tb[TCA_FW_CLASSID-1]) { if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) goto errout;// 类别ID赋值 f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); tcf_bind_filter(tp, &f->res, base); }#ifdef CONFIG_NET_CLS_IND// 网卡设备 if (tb[TCA_FW_INDEV-1]) { err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); if (err < 0) goto errout; }#endif /* CONFIG_NET_CLS_IND */// FW掩码 if (tb[TCA_FW_MASK-1]) { if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != 
sizeof(u32)) goto errout; mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]); if (mask != head->mask) goto errout; } else if (head->mask != 0xFFFFFFFF) goto errout;// 将e中的数据赋值到f->exts中 tcf_exts_change(tp, &f->exts, &e); return 0;errout: tcf_exts_destroy(tp, &e); return err;} 7.8.8 删除// 将fw过滤器节点从系统哈希链表中断开, 释放节点// 注意一定要在系统链表中真正找到该节点才进行释放操作, 否则失败static int fw_delete(struct tcf_proto *tp, unsigned long arg){// 根节点 struct fw_head *head = (struct fw_head*)tp->root;// 要删除的fw过滤器节点 struct fw_filter *f = (struct fw_filter*)arg; struct fw_filter **fp; if (head == NULL || f == NULL) goto out;// 根据FW过滤器节点的ID哈希, 遍历相应的哈希链表 for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {// 找到该节点 if (*fp == f) { tcf_tree_lock(tp);// 将该节点从链表中断开 *fp = f->next; tcf_tree_unlock(tp);// 释放fw过滤器节点 fw_delete_filter(tp, f); return 0; } }out: return -EINVAL;}7.8.9 输出static int fw_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t){ struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f = (struct fw_filter*)fh;// 准备填数据的skb缓冲区起始位置 unsigned char *b = skb->tail; struct rtattr *rta;// fw过滤器为空,直接返回 if (f == NULL) return skb->len;// 将fw过滤器的ID作为handle t->tcm_handle = f->id;// 检查一下fw过滤器是否合法 if (!f->res.classid && !tcf_exts_is_available(&f->exts)) return skb->len; rta = (struct rtattr*)b; RTA_PUT(skb, TCA_OPTIONS, 0, NULL);// 如果类别ID非0, 填充之到缓冲区 if (f->res.classid) RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);#ifdef CONFIG_NET_CLS_IND// 网卡非空,填网卡名称到缓冲区 if (strlen(f->indev)) RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev);#endif /* CONFIG_NET_CLS_IND */// 掩码非全1值,填充到缓冲区 if (head->mask != 0xFFFFFFFF) RTA_PUT(skb, TCA_FW_MASK, 4, &head->mask);// 扩展元素的(动作/策略)输出 if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) goto rtattr_failure;// 新增的数据长度 rta->rta_len = skb->tail - b;// 输出统计参数 if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) goto rtattr_failure;// 返回数据包当前的总长度 return skb->len;rtattr_failure: skb_trim(skb, b - skb->data); return -1;} ...... 待续 ......