ip_vs实现分析(1)
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
1. 前言ipvs是章文嵩先生主持的一个开源项目,早在2.2内核时就已经以内核补丁形式出现,RedHat6.1和6.0的一个重要区别就是增加了IPVS。从2.4.24后IPVS已经成为Linux官方标准内核的一部分,2.2时IPVS是完全独立的部分,2.4以后借用了netfilter的一些处理机制,主体还是比较独立,不过功能和netfilter有重复的地方。IPVS官方网站为: http://www.linuxvirtualserver.org 。以下内核代码版本为2.6.17.11, ipvs版本为1.2.1。2. IPVS的外部表现根据LVS官方网站的介绍,LVS支持三种负载均衡模式:NAT,tunnel和direct routing(DR)。NAT是通用模式,所有交互数据必须通过均衡器;后两种则是一种半连接处理方式,请求数据通过均衡器,而服务器的回应则是直接路由返回的,而这两种方法的区别是tunnel模式下由于进行了IP封装所以可路由,而DR方式是修改MAC地址来实现,所以必须同一网段。3. 几个重要结构3.1 协议这个结构用来描述IPVS支持的IP协议。IPVS支持TCP, UDP, AH和ESP这4种IP层协议struct ip_vs_protocol {// 链表中的下一项 struct ip_vs_protocol *next;// 协议名称, "TCP", "UDP"... char *name;// 协议值: 6, 17, ... __u16 protocol;// 不进行分片重组 int dont_defrag;// 协议应用计数器,也就是该协议中多连接协议的数量 atomic_t appcnt; /* counter of proto app incs */// 协议各状态的超时数组 int *timeout_table; /* protocol timeout table */// 协议初始化 void (*init)(struct ip_vs_protocol *pp);// 协议释放 void (*exit)(struct ip_vs_protocol *pp);// 协议调度 int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp);// 查找in方向的IPVS连接 struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse);// 查找out方向的IPVS连接 struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse);// 源NAT操作 int (*snat_handler)(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp);// 目的NAT操作 int (*dnat_handler)(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp);// 协议校验和计算 int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp);// 当前协议状态名称: 如"LISTEN", "ESTABLISH"... 
const char *(*state_name)(int state);// 协议状态迁移 int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_protocol *pp);// 登记应用 int (*register_app)(struct ip_vs_app *inc);// 去除应用登记 void (*unregister_app)(struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp);// 数据包打印 void (*debug_packet)(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg);// 调整超时 void (*timeout_change)(struct ip_vs_protocol *pp, int flags);// 设置各种状态下的协议超时 int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);};3.2 IPVS连接这个结构用来描述IPVS的连接。IPVS的连接和netfilter定义的连接类似/* * IP_VS structure allocated for each dynamically scheduled connection */struct ip_vs_conn {// HASH链表 struct list_head c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */// 客户机地址 __u32 caddr; /* client address */// 服务器对外的虚拟地址 __u32 vaddr; /* virtual address */// 服务器实际地址 __u32 daddr; /* destination address */// 客户端的端口 __u16 cport;// 服务器对外虚拟端口 __u16 vport;// 服务器实际端口 __u16 dport;// 协议类型 __u16 protocol; /* Which protocol (TCP/UDP) */ /* counter and timer */// 连接引用计数 atomic_t refcnt; /* reference count */// 定时器 struct timer_list timer; /* Expiration timer */// 超时时间 volatile unsigned long timeout; /* timeout */ /* Flags and state transition */// 状态转换锁 spinlock_t lock; /* lock for state transition */ volatile __u16 flags; /* status flags */ volatile __u16 state; /* state info */ /* Control members */// 主连接, 如FTP struct ip_vs_conn *control; /* Master control connection */// 子连接数 atomic_t n_control; /* Number of controlled ones */// 真正服务器 struct ip_vs_dest *dest; /* real server */// 进入的数据统计 atomic_t in_pkts; /* incoming packet counter */ /* packet transmitter for different forwarding methods. If it mangles the packet, it must return NF_DROP or better NF_STOLEN, otherwise this must be changed to a sk_buff **. 
*/// 数据包发送 int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); /* Note: we can group the following members into a structure, in order to save more space, and the following members are only used in VS/NAT anyway */// IPVS应用 struct ip_vs_app *app; /* bound ip_vs_app object */// 应用的私有数据 void *app_data; /* Application private data */// 进入数据的序列号 struct ip_vs_seq in_seq; /* incoming seq. struct */// 发出数据的序列号 struct ip_vs_seq out_seq; /* outgoing seq. struct */};3.3 IPVS服务这个结构用来描述IPVS对外的虚拟服务器信息。/* * The information about the virtual service offered to the net * and the forwarding entries */struct ip_vs_service {// 按普通协议,地址,端口进行HASH的链表 struct list_head s_list; /* for normal service table */// 按nfmark进行HASH的链表(感觉没必要) struct list_head f_list; /* for fwmark-based service table */// 引用计数 atomic_t refcnt; /* reference counter */// 使用计数 atomic_t usecnt; /* use counter */// 协议 __u16 protocol; /* which protocol (TCP/UDP) */// 虚拟服务器地址 __u32 addr; /* IP address for virtual service */// 虚拟端口 __u16 port; /* port number for the service */// 就是skb中的nfmark __u32 fwmark; /* firewall mark of the service */// 标志 unsigned flags; /* service status flags */// 超时 unsigned timeout; /* persistent timeout in ticks */// 网络掩码 __u32 netmask; /* grouping granularity */// 真实服务器的地址链表 struct list_head destinations; /* real server d-linked list */// 真实服务器的数量 __u32 num_dests; /* number of servers */// 服务统计信息 struct ip_vs_stats stats; /* statistics for the service */// 应用 struct ip_vs_app *inc; /* bind conns to this app inc */ /* for scheduling */// 调度指针 struct ip_vs_scheduler *scheduler; /* bound scheduler object */ rwlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */};3.4 IPVS目的服务器这个结构用来描述具体的真实服务器的信息/* * The real server destination forwarding entry * with ip address, port number, and so on. 
*/struct ip_vs_dest {// struct list_head n_list; /* for the dests in the service */ struct list_head d_list; /* for table with all the dests */// 服务器地址 __u32 addr; /* IP address of the server */// 服务器端口 __u16 port; /* port number of the server */// 目标标志,易变参数 volatile unsigned flags; /* dest status flags */// 连接标志 atomic_t conn_flags; /* flags to copy to conn */// 服务器权重 atomic_t weight; /* server weight */// 引用次数 atomic_t refcnt; /* reference counter */// 统计数 struct ip_vs_stats stats; /* statistics */ /* connection counters and thresholds */// 活动的连接 atomic_t activeconns; /* active connections */// 不活动的连接 atomic_t inactconns; /* inactive connections */// 保持的连接 atomic_t persistconns; /* persistent connections */// 连接上限 __u32 u_threshold; /* upper threshold */// 连接下限 __u32 l_threshold; /* lower threshold */ /* for destination cache */ spinlock_t dst_lock; /* lock of dst_cache */ struct dst_entry *dst_cache; /* destination cache entry */ u32 dst_rtos; /* RT_TOS(tos) for dst */ /* for virtual service */ struct ip_vs_service *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ __u32 vaddr; /* virtual IP address */ __u16 vport; /* virtual port number */ __u32 vfwmark; /* firewall mark of service */};3.5 IPVS调度器这个结构用来描述IPVS调度算法,目前调度方法包括rr,wrr,lc, wlc, lblc, lblcr, dh, sh等/* * The scheduler object */struct ip_vs_scheduler { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); /* scheduling service finish */ int (*done_service)(struct ip_vs_service *svc); /* scheduler updating service */ int (*update_service)(struct ip_vs_service *svc); /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff *skb);};3.6 IPVS应用IPVS应用是针对多连接协议的, 
目前也就只支持FTP。由于ip_vs_app.c是从2.2过来的,没有考虑内核本身是否已有NAT的情况,所以相当于自身实现了应用协议的NAT处理,包括内容信息的改变,TCP序列号确认号的调整等,而现在这些都由netfilter实现了,IPVS可以不用管这些,只处理连接调度就行了。IPVS的应用模块化还不是很好,在处理连接端口时,还要判断是否是FTPPORT,也就是说不支持其他多连接协议,应该像netfilter一样为每个多连接协议设置一个helper,自动调用,不用在程序里判断端口。/* * The application module object (a.k.a. app incarnation) */struct ip_vs_app{// 用来挂接到应用链表 struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ char *name; /* application module name */// 协议, TCP, UDP... __u16 protocol;// 模块本身 struct module *module; /* THIS_MODULE/NULL */// 应用的具体实例链表 struct list_head incs_list; /* list of incarnations */ /* members for application incarnations */// 将应用结构挂接到对应协议(TCP, UDP...)的应用表 struct list_head p_list; /* member in proto app list */ struct ip_vs_app *app; /* its real application */ __u16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ /* output hook: return false if can't linearize. diff set for TCP. */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff **, int *diff); /* input hook: return false if can't linearize. diff set for TCP. 
*/ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff **, int *diff); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* ip_vs_app finish */ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* not used now */ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *, struct ip_vs_protocol *); void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); int * timeout_table; int * timeouts; int timeouts_size; int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, unsigned int proto_off, int inverse); struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, unsigned int proto_off, int inverse); int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_app *app); void (*timeout_change)(struct ip_vs_app *app, int flags);}; 3.7 用户空间数据结构用户空间信息是ipvsadm程序接收用户输入后传递给内核ipvs的信息,信息都是很直接的,没有各种控制信息。ipvsadm和ipvs的关系相当于iptables和netfilter的关系。3.7.1 用户空间的虚拟服务信息/* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. 
*/struct ip_vs_service_user { /* virtual service addresses */ u_int16_t protocol; u_int32_t addr; /* virtual ip address */ u_int16_t port; u_int32_t fwmark; /* firewall mark of service */ /* virtual service options */ char sched_name[IP_VS_SCHEDNAME_MAXLEN]; unsigned flags; /* virtual service flags */ unsigned timeout; /* persistent timeout in sec */ u_int32_t netmask; /* persistent netmask */};3.7.2 用户空间的真实服务器信息struct ip_vs_dest_user { /* destination server address */ u_int32_t addr; u_int16_t port; /* real server options */ unsigned conn_flags; /* connection flags */ int weight; /* destination weight */ /* thresholds for active connections */ u_int32_t u_threshold; /* upper threshold */ u_int32_t l_threshold; /* lower threshold */};3.7.3 用户空间的统计信息/* * IPVS statistics object (for user space) */struct ip_vs_stats_user{ __u32 conns; /* connections scheduled */ __u32 inpkts; /* incoming packets */ __u32 outpkts; /* outgoing packets */ __u64 inbytes; /* incoming bytes */ __u64 outbytes; /* outgoing bytes */ __u32 cps; /* current connection rate */ __u32 inpps; /* current in packet rate */ __u32 outpps; /* current out packet rate */ __u32 inbps; /* current in byte rate */ __u32 outbps; /* current out byte rate */};3.7.4 用户空间的获取信息结构/* The argument to IP_VS_SO_GET_INFO */struct ip_vs_getinfo { /* version number */ unsigned int version; /* size of connection hash table */ unsigned int size; /* number of virtual services */ unsigned int num_services;};3.7.5 用户空间的服务规则项信息/* The argument to IP_VS_SO_GET_SERVICE */struct ip_vs_service_entry { /* which service: user fills in these */ u_int16_t protocol; u_int32_t addr; /* virtual address */ u_int16_t port; u_int32_t fwmark; /* firewall mark of service */ /* service options */ char sched_name[IP_VS_SCHEDNAME_MAXLEN]; unsigned flags; /* virtual service flags */ unsigned timeout; /* persistent timeout */ u_int32_t netmask; /* persistent netmask */ /* number of real servers */ unsigned int num_dests; /* statistics */ struct 
ip_vs_stats_user stats;};3.7.6 用户空间的服务器项信息struct ip_vs_dest_entry { u_int32_t addr; /* destination address */ u_int16_t port; unsigned conn_flags; /* connection flags */ int weight; /* destination weight */ u_int32_t u_threshold; /* upper threshold */ u_int32_t l_threshold; /* lower threshold */ u_int32_t activeconns; /* active connections */ u_int32_t inactconns; /* inactive connections */ u_int32_t persistconns; /* persistent connections */ /* statistics */ struct ip_vs_stats_user stats;};3.7.7 用户空间的获取服务器项信息/* The argument to IP_VS_SO_GET_DESTS */struct ip_vs_get_dests { /* which service: user fills in these */ u_int16_t protocol; u_int32_t addr; /* virtual address */ u_int16_t port; u_int32_t fwmark; /* firewall mark of service */ /* number of real servers */ unsigned int num_dests; /* the real servers */ struct ip_vs_dest_entry entrytable[0];};3.7.8 用户空间的获取虚拟服务项信息/* The argument to IP_VS_SO_GET_SERVICES */struct ip_vs_get_services { /* number of virtual services */ unsigned int num_services; /* service table */ struct ip_vs_service_entry entrytable[0];};3.7.9 用户空间的获取超时信息结构/* The argument to IP_VS_SO_GET_TIMEOUT */struct ip_vs_timeout_user { int tcp_timeout; int tcp_fin_timeout; int udp_timeout;};3.7.10 用户空间的获取IPVS内核守护进程信息结构/* The argument to IP_VS_SO_GET_DAEMON */struct ip_vs_daemon_user { /* sync daemon state (master/backup) */ int state; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* SyncID we belong to */ int syncid;};......待续......