ipvs的数据报处理
ipvs的最基本原理就是数据报的重定向传送.本节用如下拓扑图来分析ipvs中数据报重定向的处理过程.
                      ________
                     |        |
                     | client |
                     |________|
                 CIP=192.168.1.254
                         |
                      (router)
                         |
           __________    |
          |          |   |   VIP=192.168.1.110 (eth0:110)
          | director |---|
          |__________|   |   DIP=10.1.1.9 (eth0:9)
                         |
                         |
        -----------------------------------
        |                |                |
        |                |                |
  RIP1=10.1.1.2    RIP2=10.1.1.3    RIP3=10.1.1.4 (all eth0)
  _____________    _____________    _____________
 |             |  |             |  |             |
 | realserver  |  | realserver  |  | realserver  |
 |_____________|  |_____________|  |_____________|
(一)注册钩子函数
nf_register_hooks() -> nf_register_hook() 将ip_vs_ops[]里的4个hook添加到了nf_hooks[PF_INET][HOOK_NAME]内.每当有数据经过相应的规则链时,就会调用相应的函数对数据报进行处理.
(二)注册ipvs支持的协议
register_ip_vs_protocol()将能支持的协议加入到ip_vs_proto_table[]中.ip_vs_protocol类型的结构里,包含了该协议的参数以及处理该协议的函数指针.比如tcp协议的结构看上去是这个样子
net/ipv4/ipvs/ip_vs_proto_tcp.c- struct ip_vs_protocol ip_vs_protocol_tcp = {
- .name = "TCP",
- .protocol = IPPROTO_TCP,
- .num_states = IP_VS_TCP_S_LAST,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
- .register_app = tcp_register_app,
- .unregister_app = tcp_unregister_app,
- .conn_schedule = tcp_conn_schedule,
- .conn_in_get = tcp_conn_in_get, //
- .conn_out_get = tcp_conn_out_get, //
- .snat_handler = tcp_snat_handler,
- .dnat_handler = tcp_dnat_handler,
- .csum_check = tcp_csum_check, //
- .state_name = tcp_state_name, //
- .state_transition = tcp_state_transition, //
- .app_conn_bind = tcp_app_conn_bind, //
- .debug_packet = ip_vs_tcpudp_debug_packet, //
- .timeout_change = tcp_timeout_change, //
- .set_state_timeout = tcp_set_state_timeout, //
- };
|
(三)添加虚拟服务器
举个具体的例子:对于挂在NF_INET_LOCAL_IN规则链上的ip_vs_in()而言,它会怎么工作呢?答案是如果没有规则,那么这个函数将什么都不会做
接下来分析的重点就是如何添加规则,让ip_vs_in()这个函数知道怎么处理特定的数据.
要想添加规则,首先要做的就是在用户空间添加一个vs(虚拟服务器)到内核空间.下面这条命令添加一个虚拟服务器,协议为tcp,虚拟服务器地址是192.168.1.110:80,使用的调度方法是轮叫调度(Round-Robin Scheduling)
ipvsadm -A -t 192.168.1.110:80 -s rr
虚拟服务信息(svc)将存放在ip_vs_svc_table[],如果你的规则用到了防火墙标记,那么将会存放在ip_vs_svc_fwm_table[]内,在内核空间,如下代码会被执行
net/ipv4/ipvs/ip_vs_ctl.c-
- * Add a service into the service hash table
-
- static int
- ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
- {
- int ret = 0;
- struct ip_vs_scheduler *sched = NULL;
- struct ip_vs_service *svc = NULL;
-
-
- ip_vs_use_count_inc();
-
-
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
- u->sched_name);
- ret = -ENOENT;
- goto out_mod_dec;
- }
-
- svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
- if (svc == NULL) {
- IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
- ret = -ENOMEM;
- goto out_err;
- }
-
-
- atomic_set(&svc->usecnt, 1);
- atomic_set(&svc->refcnt, 0);
-
- svc->protocol = u->protocol;
- svc->addr = u->addr;
- svc->port = u->port;
- svc->fwmark = u->fwmark;
- svc->flags = u->flags;
- svc->timeout = u->timeout * HZ;
- svc->netmask = u->netmask;
-
- INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
-
-
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret)
- goto out_err;
- sched = NULL;
-
-
- if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
- else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
-
- ip_vs_new_estimator(&svc->stats);
- ip_vs_num_services++;
-
-
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
-
- *svc_p = svc;
- return 0;
-
- out_err:
- if (svc != NULL) {
- if (svc->scheduler)
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- kfree(svc);
- }
- ip_vs_scheduler_put(sched);
-
- out_mod_dec:
-
- ip_vs_use_count_dec();
-
- return ret;
- }
|
(四)添加真实服务器(rs)
然后添加几个真实服务器(rs)到内核空间.下面的命令往协议为tcp,地址为192.168.1.110:80的虚拟服务器中添加3个真实服务器,分别是10.1.1.2:80,10.1.1.3:80,10.1.1.4:80,都使用NAT方式(-m)
ipvsadm -a -t 192.168.1.110:80 -r 10.1.1.2:80 -m
ipvsadm -a -t 192.168.1.110:80 -r 10.1.1.3:80 -m
ipvsadm -a -t 192.168.1.110:80 -r 10.1.1.4:80 -m
以下代码会在内核空间执行,将真实服务器信息添加到ip_vs_rtable[]和svc->destinations中,然后执行svc->scheduler->update_service(svc)
net/ipv4/ipvs/ip_vs_ctl.c-
- * Add a destination into an existing service
-
- static int
- ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
- {
- struct ip_vs_dest *dest;
- __be32 daddr = udest->addr;
- __be16 dport = udest->port;
- int ret;
-
- EnterFunction(2);
-
- if (udest->weight < 0) {
- IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
- return -ERANGE;
- }
-
- if (udest->l_threshold > udest->u_threshold) {
- IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
- "upper threshold\n");
- return -ERANGE;
- }
-
-
- * Check if the dest already exists in the list
-
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest != NULL) {
- IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
- return -EEXIST;
- }
-
-
- * Check if the dest already exists in the trash and
- * is from the same service
-
- dest = ip_vs_trash_get_dest(svc, daddr, dport);
- if (dest != NULL) {
- IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
- "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
- NIPQUAD(daddr), ntohs(dport),
- atomic_read(&dest->refcnt),
- dest->vfwmark,
- NIPQUAD(dest->vaddr),
- ntohs(dest->vport));
- __ip_vs_update_dest(svc, dest, udest);
-
-
- * Get the destination from the trash
-
- list_del(&dest->n_list);
-
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
-
- * Wait until all other svc users go away.
-
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
-
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
- }
-
-
- * Allocate and initialize the dest structure
-
- ret = ip_vs_new_dest(svc, udest, &dest);
- if (ret) {
- return ret;
- }
-
-
- * Add the dest entry into the list
-
- atomic_inc(&dest->refcnt);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
-
- * Wait until all other svc users go away.
-
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
-
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
- LeaveFunction(2);
-
- return 0;
- }
|
(五)当虚拟服务器被访问
<5.1> rs的选择
需要做的配置就这么多,接下来只要等数据报经过相应的规则链即可.这里主要分析请求进入(INPUT)方向的处理;返回方向的数据只有在nat方式下才需要修改,而且比较简单,所以不再分析.在我们的例子中,当client访问vs,skb经过INPUT链时ip_vs_in()会被调用,ip_vs_in()中对skb处理的关键代码为:
net/ipv4/ipvs/ip_vs_core.c- if (unlikely(!cp)) {
- int v;
-
- if (!pp->conn_schedule(skb, pp, &v, &cp))
- return v;
- }
|
对于tcp协议来说,pp->conn_schedule()调用的就是tcp_conn_schedule().
net/ipv4/ipvs/ip_vs_proto_tcp.c- static int
- tcp_conn_schedule(struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
- {
- struct ip_vs_service *svc;
- struct tcphdr _tcph, *th;
-
- th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
- if (th == NULL) {
- *verdict = NF_DROP;
- return 0;
- }
-
- if (th->syn &&
- (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
- ip_hdr(skb)->daddr, th->dest))) {
- if (ip_vs_todrop()) {
-
- * It seems that we are very loaded.
- * We have to drop this packet :(
-
- ip_vs_service_put(svc);
- *verdict = NF_DROP;
- return 0;
- }
-
-
- * Let the virtual server select a real server for the
- * incoming connection, and create a connection entry.
-
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
- *verdict = ip_vs_leave(svc, skb, pp);
- return 0;
- }
- ip_vs_service_put(svc);
- }
- return 1;
- }
|
上面代码中的ip_vs_schedule()内部会调用svc->scheduler->schedule()来选择rs.对于我们的例子来说(调度方法为rr),实际被调用的就是ip_vs_rr_schedule().具体的调度方法,请参考ipvs的调度
<5.2> skb的处理
rs选择好以后,只是修改了cp(kmem_cache_zalloc出来的)的目标地址并将cp加入到了ip_vs_conn_tab列表中,但是并没有修改skb.接下来还要将skb进行处理,使得skb能被转发到rs上去.在ip_vs_in函数中相关代码为
net/ipv4/ipvs/ip_vs_core.c- if (cp->packet_xmit)
- ret = cp->packet_xmit(skb, cp, pp);
-
|
5.2.1> ip_vs_nat_xmit
例子中3个rs都是nat方式的,所以毫无悬念,这里会调用ip_vs_nat_xmit()进行处理.ipvs一共有nat,tunnel,dr这3种转发方式
net/ipv4/ipvs/ip_vs_xmit.c-
- * NAT transmitter (only for outside-to-inside nat forwarding)
- * Not used for related ICMP
-
- int
- ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
- {
- struct rtable *rt;
- int mtu;
- struct iphdr *iph = ip_hdr(skb);
-
- EnterFunction(10);
-
-
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
- __be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
- if (p == NULL)
- goto tx_error;
- ip_vs_conn_fill_cport(cp, *p);
- IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
- goto tx_error_icmp;
-
-
- mtu = dst_mtu(&rt->u.dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
- goto tx_error;
- }
-
-
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
- goto tx_error_put;
-
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
- goto tx_error_put;
-
-
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
-
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-
- goto tx_error;
- ip_hdr(skb)->daddr = cp->daddr;
- ip_send_check(ip_hdr(skb));
-
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
-
- FIXME: when application helper enlarges the packet and the length
- is larger than the MTU of outgoing device, there will be still
- MTU problem.
-
-
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- LeaveFunction(10);
- kfree_skb(skb);
- return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
- }
|
这段代码修改了skb的目的地址和目的端口.好像没有直接对链路层的数据进行修改?个人感觉应该是这样的:skb所控制的数据报在被网络设备发送出去之前,链路层的信息会被重新整理,比如源/目的mac地址由skb->dst来确定.也就是说,这个数据报文被发往同网段的哪一台主机,由skb->dst决定
5.2.2> ip_vs_tunnel_xmit
net/ipv4/ipvs/ip_vs_xmit.c-
- * IP Tunneling transmitter
- *
- * This function encapsulates the packet in a new IP packet, its
- * destination will be set to cp->daddr. Most code of this function
- * is taken from ipip.c.
- *
- * It is used in VS/TUN cluster. The load balancer selects a real
- * server from a cluster based on a scheduling algorithm,
- * encapsulates the request packet and forwards it to the selected
- * server. For example, all real servers are configured with
- * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
- * the encapsulated packet, it will decapsulate the packet, processe
- * the request and return the response packets directly to the client
- * without passing the load balancer. This can greatly increase the
- * scalability of virtual server.
- *
- * Used for ANY protocol
-
- int
- ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
- {
- struct rtable *rt;
- struct net_device *tdev;
- struct iphdr *old_iph = ip_hdr(skb);
- u8 tos = old_iph->tos;
- __be16 df = old_iph->frag_off;
- sk_buff_data_t old_transport_header = skb->transport_header;
- struct iphdr *iph;
- unsigned int max_headroom;
- int mtu;
-
- EnterFunction(10);
-
- if (skb->protocol != htons(ETH_P_IP)) {
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
- "ETH_P_IP: %d, skb protocol: %d\n",
- htons(ETH_P_IP), skb->protocol);
- goto tx_error;
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
- goto tx_error_icmp;
-
- tdev = rt->u.dst.dev;
-
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
- goto tx_error;
- }
- if (skb->dst)
- skb->dst->ops->update_pmtu(skb->dst, mtu);
-
- df |= (old_iph->frag_off & htons(IP_DF));
-
- if ((old_iph->frag_off & htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
- goto tx_error;
- }
-
-
- * Okay, now see if we can stuff it in the buffer as-is.
-
- max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
-
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
- struct sk_buff *new_skb =
- skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
- return NF_STOLEN;
- }
- kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- }
-
- skb->transport_header = old_transport_header;
-
-
- ip_send_check(old_iph);
-
- skb_push(skb, sizeof(struct iphdr));
- skb_reset_network_header(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
-
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
-
- * Push down and install the IPIP header.
-
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = tos;
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
- iph->ttl = old_iph->ttl;
- ip_select_ident(iph, &rt->u.dst, NULL);
-
-
- skb->local_df = 1;
-
- ip_local_out(skb);
-
- LeaveFunction(10);
-
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- kfree_skb(skb);
- LeaveFunction(10);
- return NF_STOLEN;
- }
|
5.2.3> ip_vs_dr_xmit
net/ipv4/ipvs/ip_vs_xmit.c-
- * Direct Routing transmitter
- * Used for ANY protocol
-
- int
- ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
- {
- struct rtable *rt;
- struct iphdr *iph = ip_hdr(skb);
- int mtu;
-
- EnterFunction(10);
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
- goto tx_error_icmp;
-
-
- mtu = dst_mtu(&rt->u.dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
- goto tx_error;
- }
-
-
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
-
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(ip_hdr(skb));
-
-
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
-
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- kfree_skb(skb);
- LeaveFunction(10);
- return NF_STOLEN;
- }
|
这里有点特别的是skb中网络层的目的地址(vip)跟skb->dst的目的地址(rip)不一致.数据报下一跳的mac地址由skb->dst来决定.当rs收到这种mac地址属于自己,ip地址却不属于自己的数据报时,如果不做处理,这个数据报将会被丢弃.所以还应该在rs上加上类似这样的命令,告诉rs把这样的数据报文接收下来
iptables -t nat -A PREROUTING -p tcp -d VIP --dport 80 -j REDIRECT