读书人

ip_vs实现分析(1)

发布时间: 2012-09-21 15:47:26 作者: rapoo

ip_vs实现分析(1)
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn

1. 前言ipvs是章文嵩先生主持的一个开源项目,早在2.2内核时就已经以内核补丁形式出现,RedHat6.1和6.0的一个重要区别就是增加了 IPVS。从2.4.24后IPVS已经成为Linux官方标准内核的一部分,2.2时IPVS是完全独立的部分,2.4以后借用了netfilter的一些处理机制,但主体还是比较独立,但功能和netfilter有重复的地方。IPVS官方网站为: http://www.linuxvirtualserver.org 以下内核代码版本2.6.17.11, ipvs版本为1.2.1。2. IPVS的外部表现根据LVS官方网站的介绍,LVS支持三种负载均衡模式:NAT,tunnel和direct routing(DR)。NAT是通用模式,所有交互数据必须通过均衡器;后两种则是一种半连接处理方式,请求数据通过均衡器,而服务器的回应则是直接路由返回的,而这两种方法的区别是tunnel模式下由于进行了IP封装所以可路由,而DR方式是修改MAC地址来实现,所以必须同一网段。3. 几个重要结构3.1 协议这个结构用来描述IPVS支持的IP协议。IPVS的IP层协议支持TCP, UDP, AH和ESP这4种IP层协议struct ip_vs_protocol {// 链表中的下一项 struct ip_vs_protocol *next;// 协议名称, "TCP", "UDP"... char   *name;// 协议值: 6, 17, ... __u16   protocol;// 不进行碎片重组 int   dont_defrag;// 协议应用计数器,也就是该协议中多连接协议的数量 atomic_t  appcnt;  /* counter of proto app incs */// 协议各状态的超时数组 int   *timeout_table; /* protocol timeout table */// 协议初始化 void (*init)(struct ip_vs_protocol *pp);// 协议释放 void (*exit)(struct ip_vs_protocol *pp);// 协议调度 int (*conn_schedule)(struct sk_buff *skb,        struct ip_vs_protocol *pp,        int *verdict, struct ip_vs_conn **cpp);// 查找in方向的IPVS连接 struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb,         struct ip_vs_protocol *pp,         const struct iphdr *iph,         unsigned int proto_off,         int inverse);// 查找out方向的IPVS连接 struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb,   struct ip_vs_protocol *pp,   const struct iphdr *iph,   unsigned int proto_off,   int inverse);// 源NAT操作 int (*snat_handler)(struct sk_buff **pskb,       struct ip_vs_protocol *pp, struct ip_vs_conn *cp);// 目的NAT操作 int (*dnat_handler)(struct sk_buff **pskb,       struct ip_vs_protocol *pp, struct ip_vs_conn *cp);// 协议校验和计算 int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp);// 当前协议状态名称: 如"LISTEN", "ESTABLISH"... 
const char *(*state_name)(int state);// 协议状态迁移 int (*state_transition)(struct ip_vs_conn *cp, int direction,    const struct sk_buff *skb,    struct ip_vs_protocol *pp);// 登记应用 int (*register_app)(struct ip_vs_app *inc);// 去除应用登记 void (*unregister_app)(struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp);// 数据包打印 void (*debug_packet)(struct ip_vs_protocol *pp,        const struct sk_buff *skb,        int offset,        const char *msg);// 调整超时 void (*timeout_change)(struct ip_vs_protocol *pp, int flags);// 设置各种状态下的协议超时 int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);};3.2 IPVS连接这个结构用来描述IPVS的连接。IPVS的连接和netfilter定义的连接类似/* * IP_VS structure allocated for each dynamically scheduled connection */struct ip_vs_conn {// HASH链表 struct list_head        c_list;         /* hashed list heads */ /* Protocol, addresses and port numbers */// 客户机地址 __u32                   caddr;          /* client address */// 服务器对外的虚拟地址 __u32                   vaddr;          /* virtual address */// 服务器实际地址 __u32                   daddr;          /* destination address */// 客户端的端口 __u16                   cport;// 服务器对外虚拟端口 __u16                   vport;// 服务器实际端口 __u16                   dport;// 协议类型 __u16                   protocol;       /* Which protocol (TCP/UDP) */ /* counter and timer */// 连接引用计数 atomic_t  refcnt;  /* reference count */// 定时器 struct timer_list timer;  /* Expiration timer */// 超时时间 volatile unsigned long timeout; /* timeout */ /* Flags and state transition */// 状态转换锁 spinlock_t              lock;           /* lock for state transition */ volatile __u16          flags;          /* status flags */ volatile __u16          state;          /* state info */ /* Control members */// 主连接, 如FTP struct ip_vs_conn       *control;       /* Master control connection */// 子连接数 atomic_t                n_control;      /* Number of controlled ones */// 真正服务器 struct ip_vs_dest       *dest;          /* real server */// 进入的数据统计 atomic_t                
in_pkts;        /* incoming packet counter */ /* packet transmitter for different forwarding methods.  If it    mangles the packet, it must return NF_DROP or better NF_STOLEN,    otherwise this must be changed to a sk_buff **.  */// 数据包发送 int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,      struct ip_vs_protocol *pp); /* Note: we can group the following members into a structure,    in order to save more space, and the following members are    only used in VS/NAT anyway */// IPVS应用 struct ip_vs_app        *app;           /* bound ip_vs_app object */// 应用的私有数据 void                    *app_data;      /* Application private data */// 进入数据的序列号 struct ip_vs_seq        in_seq;         /* incoming seq. struct */// 发出数据的序列号 struct ip_vs_seq        out_seq;        /* outgoing seq. struct */};3.3 IPVS服务这个结构用来描述IPVS对外的虚拟服务器信息。/* * The information about the virtual service offered to the net * and the forwarding entries */struct ip_vs_service {// 按普通协议,地址,端口进行HASH的链表 struct list_head s_list;   /* for normal service table */// 按nfmark进行HASH的链表(感觉没必要) struct list_head f_list;   /* for fwmark-based service table */// 引用计数 atomic_t  refcnt;   /* reference counter */// 使用计数 atomic_t  usecnt;   /* use counter */// 协议 __u16   protocol; /* which protocol (TCP/UDP) */// 虚拟服务器地址 __u32   addr;   /* IP address for virtual service */// 虚拟端口 __u16   port;   /* port number for the service */// 就是skb中的nfmark __u32                   fwmark;   /* firewall mark of the service */// 标志 unsigned  flags;   /* service status flags */// 超时 unsigned  timeout;  /* persistent timeout in ticks */// 网络掩码 __u32   netmask;  /* grouping granularity */// 真实服务器的地址链表 struct list_head destinations;  /* real server d-linked list */// 真实服务器的数量 __u32   num_dests;     /* number of servers */// 服务统计信息 struct ip_vs_stats      stats;         /* statistics for the service */// 应用 struct ip_vs_app *inc;   /* bind conns to this app inc */ /* for scheduling */// 调度指针 struct ip_vs_scheduler *scheduler;    /* 
bound scheduler object */ rwlock_t  sched_lock;    /* lock sched_data */ void   *sched_data;   /* scheduler application data */};3.4 IPVS目的服务器这个结构用来描述具体的真实服务器的信息/* * The real server destination forwarding entry * with ip address, port number, and so on. */struct ip_vs_dest {// struct list_head n_list;   /* for the dests in the service */ struct list_head d_list;   /* for table with all the dests */// 服务器地址 __u32   addr;  /* IP address of the server */// 服务器端口 __u16   port;  /* port number of the server */// 目标标志,易变参数 volatile unsigned flags;  /* dest status flags */// 连接标志 atomic_t  conn_flags; /* flags to copy to conn */// 服务器权重 atomic_t  weight;  /* server weight */// 引用次数 atomic_t  refcnt;  /* reference counter */// 统计数 struct ip_vs_stats      stats;          /* statistics */ /* connection counters and thresholds */// 活动的连接 atomic_t  activeconns; /* active connections */// 不活动的连接 atomic_t  inactconns; /* inactive connections */// 保持的连接 atomic_t  persistconns; /* persistent connections */// 连接上限 __u32   u_threshold; /* upper threshold */// 连接下限 __u32   l_threshold; /* lower threshold */ /* for destination cache */ spinlock_t  dst_lock; /* lock of dst_cache */ struct dst_entry *dst_cache; /* destination cache entry */ u32   dst_rtos; /* RT_TOS(tos) for dst */ /* for virtual service */ struct ip_vs_service *svc;  /* service it belongs to */ __u16   protocol; /* which protocol (TCP/UDP) */ __u32   vaddr;  /* virtual IP address */ __u16   vport;  /* virtual port number */ __u32   vfwmark; /* firewall mark of service */};3.5 IPVS调度器这个结构用来描述IPVS调度算法,目前调度方法包括rr,wrr,lc, wlc, lblc, lblcr, dh, sh等/* * The scheduler object */struct ip_vs_scheduler { struct list_head n_list;  /* d-linked list head */ char   *name;  /* scheduler name */ atomic_t  refcnt;  /* reference counter */ struct module  *module; /* THIS_MODULE/NULL */ /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); /* scheduling service finish */ int (*done_service)(struct 
ip_vs_service *svc); /* scheduler updating service */ int (*update_service)(struct ip_vs_service *svc); /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,           const struct sk_buff *skb);};3.6 IPVS应用IPVS应用是针对多连接协议的, 目前也就只支持FTP。由于ip_vs_app.c是从2.2过来的,没有管内核是否本身有NAT的情况,所以相当于自身实现了应用协议的NAT处理,包括内容信息的改变,TCP序列号确认号的调整等,而现在这些都由netfilter实现了,IPVS可以不用管这些,只处理连接调度就行了。IPVS的应用模块化还不是很好,在处理连接端口时,还要判断是否是FTPPORT,也就是说不支持其他多连接协议的,应该象netfilter一样为每个多连接协议设置一个helper,自动调用,不用在程序里判断端口。/* * The application module object (a.k.a. app incarnation) */struct ip_vs_app{// 用来挂接到应用链表 struct list_head a_list;  /* member in app list */ int   type;  /* IP_VS_APP_TYPE_xxx */ char   *name;  /* application module name */// 协议, TCP, UDP... __u16   protocol;// 模块本身 struct module  *module; /* THIS_MODULE/NULL */// 应用的具体实例链表 struct list_head incs_list; /* list of incarnations */ /* members for application incarnations */// 将应用结构挂接到对应协议(TCP, UDP...)的应用表 struct list_head p_list;  /* member in proto app list */ struct ip_vs_app *app;  /* its real application */ __u16   port;  /* port number in net order */ atomic_t  usecnt;  /* usage counter */ /* output hook: return false if can't linearize. diff set for TCP.  */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,         struct sk_buff **, int *diff); /* input hook: return false if can't linearize. diff set for TCP. 
*/ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,        struct sk_buff **, int *diff); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* ip_vs_app finish */ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* not used now */ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *,    struct ip_vs_protocol *); void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); int *   timeout_table; int *   timeouts; int   timeouts_size; int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,        int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app,         const struct iphdr *iph, unsigned int proto_off,         int inverse); struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app,   const struct iphdr *iph, unsigned int proto_off,   int inverse); int (*state_transition)(struct ip_vs_conn *cp, int direction,    const struct sk_buff *skb,    struct ip_vs_app *app); void (*timeout_change)(struct ip_vs_app *app, int flags);}; 3.7 用户空间数据结构用户空间信息是ipvsadm程序接收用户输入后传递给内核ipvs的信息,信息都是很直接的,没有各种控制信息。ipvsadm和ipvs的关系相当于iptables和netfilter的关系。3.7.1 用户空间的虚拟服务信息/* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. 
*/struct ip_vs_service_user { /* virtual service addresses */ u_int16_t  protocol; u_int32_t  addr;  /* virtual ip address */ u_int16_t  port; u_int32_t  fwmark;  /* firwall mark of service */ /* virtual service options */ char   sched_name[IP_VS_SCHEDNAME_MAXLEN]; unsigned  flags;  /* virtual service flags */ unsigned  timeout; /* persistent timeout in sec */ u_int32_t  netmask; /* persistent netmask */};3.7.2 用户空间的真实服务器信息struct ip_vs_dest_user { /* destination server address */ u_int32_t  addr; u_int16_t  port; /* real server options */ unsigned  conn_flags; /* connection flags */ int   weight;  /* destination weight */ /* thresholds for active connections */ u_int32_t  u_threshold; /* upper threshold */ u_int32_t  l_threshold; /* lower threshold */};3.7.3 用户空间的统计信息/* * IPVS statistics object (for user space) */struct ip_vs_stats_user{ __u32                   conns;          /* connections scheduled */ __u32                   inpkts;         /* incoming packets */ __u32                   outpkts;        /* outgoing packets */ __u64                   inbytes;        /* incoming bytes */ __u64                   outbytes;       /* outgoing bytes */ __u32   cps;  /* current connection rate */ __u32   inpps;  /* current in packet rate */ __u32   outpps;  /* current out packet rate */ __u32   inbps;  /* current in byte rate */ __u32   outbps;  /* current out byte rate */};3.7.4 用户空间的获取信息结构/* The argument to IP_VS_SO_GET_INFO */struct ip_vs_getinfo { /* version number */ unsigned int  version; /* size of connection hash table */ unsigned int  size; /* number of virtual services */ unsigned int  num_services;};3.7.5 用户空间的服务规则项信息/* The argument to IP_VS_SO_GET_SERVICE */struct ip_vs_service_entry { /* which service: user fills in these */ u_int16_t  protocol; u_int32_t  addr;  /* virtual address */ u_int16_t  port; u_int32_t  fwmark;  /* firwall mark of service */ /* service options */ char   sched_name[IP_VS_SCHEDNAME_MAXLEN]; unsigned  flags;          /* virtual service 
flags */ unsigned  timeout; /* persistent timeout */ u_int32_t  netmask; /* persistent netmask */ /* number of real servers */ unsigned int  num_dests; /* statistics */ struct ip_vs_stats_user stats;};3.7.6 用户空间的服务器项信息struct ip_vs_dest_entry { u_int32_t  addr;  /* destination address */ u_int16_t  port; unsigned  conn_flags; /* connection flags */ int   weight;  /* destination weight */ u_int32_t  u_threshold; /* upper threshold */ u_int32_t  l_threshold; /* lower threshold */ u_int32_t  activeconns; /* active connections */ u_int32_t  inactconns; /* inactive connections */ u_int32_t  persistconns; /* persistent connections */ /* statistics */ struct ip_vs_stats_user stats;};3.7.7 用户空间的获取服务器项信息/* The argument to IP_VS_SO_GET_DESTS */struct ip_vs_get_dests { /* which service: user fills in these */ u_int16_t  protocol; u_int32_t  addr;  /* virtual address */ u_int16_t  port; u_int32_t  fwmark;  /* firwall mark of service */ /* number of real servers */ unsigned int  num_dests; /* the real servers */ struct ip_vs_dest_entry entrytable[0];};3.7.8 用户空间的获取虚拟服务项信息/* The argument to IP_VS_SO_GET_SERVICES */struct ip_vs_get_services { /* number of virtual services */ unsigned int  num_services; /* service table */ struct ip_vs_service_entry entrytable[0];};3.7.9 用户空间的获取超时信息结构/* The argument to IP_VS_SO_GET_TIMEOUT */struct ip_vs_timeout_user { int   tcp_timeout; int   tcp_fin_timeout; int   udp_timeout;};3.7.10 用户空间的获取IPVS内核守护进程信息结构/* The argument to IP_VS_SO_GET_DAEMON */struct ip_vs_daemon_user { /* sync daemon state (master/backup) */ int   state; /* multicast interface name */ char   mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* SyncID we belong to */ int   syncid;};......待续......

读书人网 >VSTS

热点推荐