Linux内核中netlink协议族的实现(下)

Linux内核中netlink协议族的实现(上)
本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整性，严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源：http://yfydz.cublog.cn
1. 前言netlink协议族是Linux内核网络部分的一个固定部分, 一旦在内核配置中选了网络支持就自动带了而不能单独去掉。netlink的实现源码在net/netlink目录下，主要是net/netlink/af_netlink.c文件。以下内核代码版本为2.6.19.2, 如无特别说明代码取自net/netlink/af_netlink.c。2. 数据结构netlink套接口结构:/* net/netlink/af_netlink.c */struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock  sk; u32   pid; // 自己的pid, 通常是0 u32   dst_pid; // 对方的pid u32   dst_group; // 对方的组 u32   flags; u32   subscriptions; u32   ngroups; // 多播组数量 unsigned long  *groups; // 多播组号 unsigned long  state; wait_queue_head_t wait; // 等待队列,用于处理接收发送包时的top half struct netlink_callback *cb;  // 回调结构,包含回调函数 spinlock_t  cb_lock; void   (*data_ready)(struct sock *sk, int bytes); // 数据到达时                                //的操作, netlink可有不同类型, 如ROUTE, FIREWALL, ARPD等,                                  //每种类型都自己定义的data_ready处理 struct module  *module;};这个结构先是包含一个标准的struct sock结构,后面又跟和netlink相关的特有相关数据,内核中其他协议的sock也是类似定义的, 注意sock结构必须放在第一位,这是为了可以直接将sock的指针转为netlink_sock的指针。 netlink sock的表:struct netlink_table { struct nl_pid_hash hash; // 根据pid进行HASH的netlink sock链表, 相当于客户端链表 struct hlist_head mc_list; // 多播的sock链表 unsigned long *listeners;  // 监听者标志 unsigned int nl_nonroot; unsigned int groups; // 每个netlink的协议类型可以定义多个组, 8的倍数,最小是32 struct module *module; int registered;};最大可有MAX_LINKS(32)个表，处理不同协议类型的netlink套接口, 注意由于是自身的通信, 本机同时作为服务器和客户端, 服务端需要一个套接口对应, 每个客户端也要有一个套接口对应, 多个客户端的套接口形成一个链表.struct nl_pid_hash { struct hlist_head *table; // 链表节点 unsigned long rehash_time; // 重新计算HASH的时间间隔 unsigned int mask; unsigned int shift; unsigned int entries;  // 链表节点数 unsigned int max_shift; // 最大幂值 u32 rnd; // 随机数};其他和netlink数据相关的数据结构在include/linux/netlink.h中定义, 不过这些结构更多用在各具体的netlink对象的实现中, 在基本netlink套接口中到是用得不多。3. 
af_netlink协议初始化static int __init netlink_proto_init(void){ struct sk_buff *dummy_skb; int i; unsigned long max; unsigned int order;// 登记netlink_proto结构, 该结构定义如下:// static struct proto netlink_proto = {//  .name   = "NETLINK",//  .owner   = THIS_MODULE,//  .obj_size = sizeof(struct netlink_sock),// };// 最后一个参数为0, 表示不进行slab的分配, 只是简单的将netlink_proto结构// 挂接到系统的网络协议链表中,这个结构最主要是告知了netlink sock结构的大小 int err = proto_register(&netlink_proto, 0); if (err != 0)  goto out; BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb));// 分配MAX_LINKS个netlink表结构 nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table)  goto panic;// 以下根据系统内存大小计算最大链表元素个数// PAGE_SHIFT是每页大小的2的幂,对i386是12,即每页是4K,2^12// 注意num_physpages是物理页数而不是内存字节数// 对于有128*1024个物理页(i386上即512M内存)的机器,max计算值是(128*1024) >> (21-12) = 256// 对于有64*1024个物理页(i386上即256M内存)的机器,max计算值是(64*1024) >> (23-12) = 32 if (num_physpages >= (128 * 1024))  max = num_physpages >> (21 - PAGE_SHIFT); else  max = num_physpages >> (23 - PAGE_SHIFT);// 根据max再和PAGE_SHIFT计算总内存空间相应的幂值order order = get_bitmask_order(max) - 1 + PAGE_SHIFT;// max是最大节点数 max = (1UL << order) / sizeof(struct hlist_head);// order是max对于2的幂数 order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1; for (i = 0; i < MAX_LINKS; i++) {  struct nl_pid_hash *hash = &nl_table[i].hash;// 为netlink的每个协议类型分配HASH表链表头  hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table));  if (!hash->table) {   while (i-- > 0)    nl_pid_hash_free(nl_table[i].hash.table,       1 * sizeof(*hash->table));   kfree(nl_table);   goto panic;  }// 初始化HASH表参数  memset(hash->table, 0, 1 * sizeof(*hash->table));// 最大幂数  hash->max_shift = order;  hash->shift = 0;  hash->mask = 0;  hash->rehash_time = jiffies; }// 登记netlink协议族的操作结构 sock_register(&netlink_family_ops);#ifdef CONFIG_PROC_FS proc_net_fops_create("netlink", 0, &netlink_seq_fops);#endif /* The netlink device handler may be needed early. 
*/// 初始化路由netlink rtnetlink_init();out: return err;panic: panic("netlink_init: Cannot allocate nl_table\n");}core_initcall(netlink_proto_init); 4. 建立netlink套接口4.1  建立对应客户端的套接口// netlink协议族操作, 在用户程序使用socket打开netlink类型的socket时调用,// 相应的create函数在__sock_create(net/socket.c)函数中调用:static struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */};// 在用户空间每次打开netlink socket时都会调用此函数:static int netlink_create(struct socket *sock, int protocol){ struct module *module = NULL; struct netlink_sock *nlk; unsigned int groups; int err = 0;// sock状态初始化 sock-&gt;state = SS_UNCONNECTED;// 对netlink sock的类型和协议(实际是netlink_family类型)限制检查 if (sock-&gt;type != SOCK_RAW && sock-&gt;type != SOCK_DGRAM)  return -ESOCKTNOSUPPORT; if (protocol&lt;0 || protocol &gt;= MAX_LINKS)  return -EPROTONOSUPPORT; netlink_lock_table();#ifdef CONFIG_KMOD// 如果相应的netlink协议是模块又没有加载的话先加载该模块 if (!nl_table[protocol].registered) {  netlink_unlock_table();  request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);  netlink_lock_table(); }#endif if (nl_table[protocol].registered &&     try_module_get(nl_table[protocol].module))  module = nl_table[protocol].module;// groups这个值在函数后面也没见用上, 这句没意义 groups = nl_table[protocol].groups; netlink_unlock_table();// 真正的建立netlink sock的函数 if ((err = __netlink_create(sock, protocol)) &lt; 0)  goto out_module; nlk = nlk_sk(sock-&gt;sk); nlk-&gt;module = module;out: return err;out_module: module_put(module); goto out;}// 基本函数static int __netlink_create(struct socket *sock, int protocol){ struct sock *sk; struct netlink_sock *nlk;// netlink sock的基本操作 sock-&gt;ops = &netlink_ops;// 分配sock结构, 通过netlink_proto中的obj_size指出了netlink sock的大小 sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk)  return -ENOMEM;// 初始化sock基本数据, 将sock和socket关联起来 sock_init_data(sock, sk);// 将普通sock转为netlink sock,实际只是重新定义的一下指针类型,指针本身值不变 nlk = nlk_sk(sk);// 初始化sock的锁 spin_lock_init(&nlk-&gt;cb_lock);// 初始化等待队列 
init_waitqueue_head(&nlk->wait);// sock的析构函数,释放接收队列中的skb数据包 sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol;// 注意这里没有重新定义sk的sk_data_ready函数// 在sock_init_data()函数中将sk_data_ready定义为sock_def_readable()函数 return 0;}用户空间使用socket(2)系统调用打开netlink类型的套接口时, 在内核中会调用sys_socket()函数, 然后是调用__sock_create()函数, 在其中调用netlink协议族的create()函数, 即netlink_create()函数. 4.2 建立服务器端的套接口以前也介绍过另一个建立netlink sock的函数netlink_kernel_create, 一般是在netlink的各种协议类型模块初始化时调用的, 而不是socket系统调用时调用的, 每个netlink协议初始化时只调用一次, 建立一个内核中的netlink接口, 相当于服务器, 其中也调用了__netlink_create()函数:/* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */struct sock *netlink_kernel_create(int unit, unsigned int groups,                      void (*input)(struct sock *sk, int len),                      struct module *module){ struct socket *sock; struct sock *sk; struct netlink_sock *nlk; unsigned long *listeners = NULL; BUG_ON(!nl_table); if (unit<0 || unit>=MAX_LINKS)  return NULL;// 这里的lite表示只是简单分配一个socket,没有真正初始化 if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))  return NULL;// 用这个lite sock再建立netlink sock if (__netlink_create(sock, unit) < 0)  goto out_sock_release; if (groups < 32)  groups = 32;// listeners是个位图, 对应groups中每个元素 listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL); if (!listeners)  goto out_sock_release; sk = sock->sk;// 重新定义了sk_data_ready函数 sk->sk_data_ready = netlink_data_ready;// 这个是相应的各netlink协议数据处理函数 if (input)  nlk_sk(sk)->data_ready = input; if (netlink_insert(sk, 0))  goto out_sock_release; nlk = nlk_sk(sk); nlk->flags |= NETLINK_KERNEL_SOCKET; netlink_table_grab();// 注册到相应unit的netlink协议表中 nl_table[unit].groups = groups; nl_table[unit].listeners = listeners; nl_table[unit].module = module;// 该标志表示该项被登记 nl_table[unit].registered = 1; netlink_table_ungrab(); return sk;out_sock_release: kfree(listeners); sock_release(sock); return NULL;}5. 
netlink套接口的操作在__netlink_create函数中定义了netlink套接口的操作结构为netlink_ops: sock-&gt;ops = &netlink_ops;该结构定义如下:static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind =  netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, // 无定义 .accept = sock_no_accept, // 无定义 .getname = netlink_getname, .poll =  datagram_poll, .ioctl = sock_no_ioctl, // 无定义 .listen = sock_no_listen, // 无定义 .shutdown = sock_no_shutdown, // 无定义 .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap =  sock_no_mmap, // 无定义 .sendpage = sock_no_sendpage, // 无定义};5.1 释放在close(2)时调用static int netlink_release(struct socket *sock){ struct sock *sk = sock-&gt;sk; struct netlink_sock *nlk; if (!sk)  return 0;// 将套接口sk从系统sk链表和绑定链表中断开 netlink_remove(sk); nlk = nlk_sk(sk); spin_lock(&nlk-&gt;cb_lock); if (nlk-&gt;cb) {// 释放netlink控制块处理  if (nlk-&gt;cb-&gt;done)   nlk-&gt;cb-&gt;done(nlk-&gt;cb);  netlink_destroy_callback(nlk-&gt;cb);  nlk-&gt;cb = NULL; } spin_unlock(&nlk-&gt;cb_lock); /* OK. 
Socket is unlinked, and, therefore,    no new packets will arrive */// 设置sk状态为SOCK_DEAD, 断开sock和sk的互指 sock_orphan(sk); sock->sk = NULL;// 唤醒所有等待队列 wake_up_interruptible_all(&nlk->wait);// 清空写队列 skb_queue_purge(&sk->sk_write_queue); if (nlk->pid && !nlk->subscriptions) {// 发送释放通知  struct netlink_notify n = {      .protocol = sk->sk_protocol,      .pid = nlk->pid,       };  atomic_notifier_call_chain(&netlink_chain,    NETLINK_URELEASE, &n); } // 减少模块计数 if (nlk->module)  module_put(nlk->module);// 相当于加锁 netlink_table_grab(); if (nlk->flags & NETLINK_KERNEL_SOCKET) {// 释放内核中的netlink服务器端  kfree(nl_table[sk->sk_protocol].listeners);  nl_table[sk->sk_protocol].module = NULL;  nl_table[sk->sk_protocol].registered = 0; } else if (nlk->subscriptions)  netlink_update_listeners(sk);// 相当于解锁 netlink_table_ungrab();// 释放该netlink sock的多播组 kfree(nlk->groups); nlk->groups = NULL;// 释放sock sock_put(sk); return 0;}5.2 绑定bind绑定通常是针对服务端static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len){ struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err;// 检查一下地址的协议族是否为AF_NETLINK  if (nladdr->nl_family != AF_NETLINK)  return -EINVAL; /* Only superuser is allowed to listen multicasts */ if (nladdr->nl_groups) {// 指定了多播组, 这时需要root权限  if (!netlink_capable(sock, NL_NONROOT_RECV))   return -EPERM;  if (nlk->groups == NULL) {// 分配多播组空间   err = netlink_alloc_groups(sk);   if (err)    return err;  } } if (nlk->pid) {// 如果sock的pid非0, 检查是否匹配在nladdr地址结构中指定的pid  if (nladdr->nl_pid != nlk->pid)   return -EINVAL; } else {// sock的pid为0, 根据nladdr是否指定pid来执行插入或自动绑定  err = nladdr->nl_pid ?   
netlink_insert(sk, nladdr-&gt;nl_pid) :   netlink_autobind(sock);  if (err)   return err; }// 非多播情况时就可以返回成功了 if (!nladdr-&gt;nl_groups && (nlk-&gt;groups == NULL || !(u32)nlk-&gt;groups[0]))  return 0; netlink_table_grab();// 多播情况下更新sock参数 netlink_update_subscriptions(sk, nlk-&gt;subscriptions +                                  hweight32(nladdr-&gt;nl_groups) -                                  hweight32(nlk-&gt;groups[0])); nlk-&gt;groups[0] = (nlk-&gt;groups[0] & ~0xffffffffUL) | nladdr-&gt;nl_groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0;}// 根据pid插入static int netlink_insert(struct sock *sk, u32 pid){// netlink相应协议的HASH结构 struct nl_pid_hash *hash = &nl_table[sk-&gt;sk_protocol].hash; struct hlist_head *head;// 缺省错误为地址已经被使用 int err = -EADDRINUSE; struct sock *osk; struct hlist_node *node; int len; netlink_table_grab();// 根据pid查找相应HASH链表头 head = nl_pid_hashfn(hash, pid); len = 0;// 检查pid是否已经在链表中, 有则失败 sk_for_each(osk, node, head) {  if (nlk_sk(osk)-&gt;pid == pid)   break;  len++; } if (node)  goto err;// 缺省错误改为系统忙 err = -EBUSY;// 如果sock的pid不为0, 错误, 只有pid为0的sock才能执行该函数// sock的pid不为0时不会再进行insert操作了 if (nlk_sk(sk)-&gt;pid)  goto err;// 缺省错误改为无内存空间 err = -ENOMEM; if (BITS_PER_LONG &gt; 32 && unlikely(hash-&gt;entries &gt;= UINT_MAX))  goto err;// 如果链表不为空而且链表长度数量过长,会调整HASH表,重新获取HASH链表头// 不过这种情况很少发生 if (len && nl_pid_hash_dilute(hash, len))  head = nl_pid_hashfn(hash, pid); hash-&gt;entries++;// 将pid赋值给sock的pid参数 nlk_sk(sk)-&gt;pid = pid;// 将sock节点添加进HASH链表 sk_add_node(sk, head); err = 0;err: netlink_table_ungrab(); return err;}// 未指定pid时的自动绑定// 实际是选一个没用过的pid后再进行插入操作static int netlink_autobind(struct socket *sock){// 从socket找到sock struct sock *sk = sock-&gt;sk;// netlink相应协议的HASH结构 struct nl_pid_hash *hash = &nl_table[sk-&gt;sk_protocol].hash; struct hlist_head *head; struct sock *osk; struct hlist_node *node;// pid取为当前进程的组ID s32 pid = current-&gt;tgid; int err;// 有符号32位数 static s32 rover = -4097;retry: cond_resched(); netlink_table_grab();// 
找合适的HASH链表头 head = nl_pid_hashfn(hash, pid); sk_for_each(osk, node, head) {// 查找链表中是否已经有该pid  if (nlk_sk(osk)->pid == pid) {// 存在, 则更新pid, 重新检查, 注意这时的pid是个负数   /* Bind collision, search negative pid values. */   pid = rover--;   if (rover > -4097)    rover = -4097;   netlink_table_ungrab();   goto retry;  } } netlink_table_ungrab();// 如果发生过冲突, 此时的pid是一个负数, 转换为无符号32位数后将是一个非常大的数// 执行正常的pid插入 err = netlink_insert(sk, pid); if (err == -EADDRINUSE)  goto retry; /* If 2 threads race to autobind, that is fine.  */ if (err == -EBUSY)  err = 0; return err;}// 更新subscriptions static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions){ struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions)  __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions)  sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions;}// 更新listeners static void netlink_update_listeners(struct sock *sk){ struct netlink_table *tbl = &nl_table[sk->sk_protocol]; struct hlist_node *node; unsigned long mask; unsigned int i; for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) {  mask = 0;// 遍历多播链表生成多播组的掩码  sk_for_each_bound(sk, node, &tbl->mc_list)   mask |= nlk_sk(sk)->groups[i];  tbl->listeners[i] = mask; } /* this function is only called with the netlink table "grabbed", which  * makes sure updates are visible before bind or setsockopt return. */}......待续.....
Linux内核中netlink协议族的实现(下)

热点推荐