ipvs的调度
每当有新的数据报(skb)访问vs的时候,首先ip_vs_in()会从skb中提取出对应的协议,并在ip_vs_proto_table[]里找出该协议对应的变量pp;
然后根据skb里的目标协议/地址/端口或者防火墙标记,在ip_vs_svc_table[]或者ip_vs_svc_fwm_table[]里找出虚拟服务的变量svc;最后调用svc->scheduler->schedule()得到dest(真实服务器),该函数指针是在添加虚拟服务的时候由ip_vs_add_service()设定的.
先来看看svc和rs
svc->destinations和rs->n_list构成一个双向循环链表结构(下图中的nlist即代码里的n_list字段).
+----------------------------<----------------------->--------------------------+
| +------------------+ +---------------+ +---------------+ |
| |svc(ip_vs_service)| |rs1(ip_vs_dest)| |rs2(ip_vs_dest)| |
+<------> destinations <----------> nlist <-----------------> nlist <---------->+
| *scheduler | | addr | | addr |
| *sched_data | | port | | port |
| sched_lock | | flags | | flags |
| addr | | weight | | weight |
+---> | port | | *svc --------->-+ | *svc --------->---+
| | fwmark | | protocol | | | protocol | |
| | protocol | | vaddr | | | vaddr | |
| | ~ | | vport | | | vport | |
| | ~ | | vfwmark | V | vfwmark | V
| | ~ | | ~ | | | ~ | |
| +------------------+ +---------------+ | +---------------+ |
+--<------------------------<-----------------------+-<-------------------------+
svc->scheduler可能会指向的结构有ip_vs_dh_scheduler, ip_vs_lblc_scheduler, ip_vs_lblcr_scheduler, ip_vs_lc_scheduler, ip_vs_nq_scheduler, ip_vs_rr_scheduler, ip_vs_sed_scheduler, ip_vs_sh_scheduler, ip_vs_wlc_scheduler, ip_vs_wrr_scheduler这10种.代表了10种不同的调度方法
轮叫调度
Round-Robin Scheduling
我们先来看看rr
net/ipv4/ipvs/ip_vs_rr.c- static struct ip_vs_scheduler ip_vs_rr_scheduler = { /* descriptor of the "rr" scheduler */
- .name = "rr",
- .refcnt = ATOMIC_INIT(0), /* counter initialised to zero */
- .module = THIS_MODULE,
- .init_service = ip_vs_rr_init_svc, /* NOTE(review): the three *_service handlers are not part of this excerpt */
- .done_service = ip_vs_rr_done_svc,
- .update_service = ip_vs_rr_update_svc,
- .schedule = ip_vs_rr_schedule, /* selection routine, shown in the next listing */
- };
|
可以看出,核心的调度算法就是ip_vs_rr_schedule()
net/ipv4/ipvs/ip_vs_rr.c- /*
- * Round-Robin Scheduling
- */
- static struct ip_vs_dest * /* returns the selected destination, or NULL when no node qualifies */
- ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct list_head *p, *q; /* p: node where this scan starts, q: scan cursor */
- struct ip_vs_dest *dest;
-
- IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
-
- write_lock(&svc->sched_lock); /* svc->sched_data is rewritten below */
- p = (struct list_head *)svc->sched_data; /* list position stored by an earlier call (see "out:") */
- p = p->next; /* begin with the node following it */
- q = p;
- do {
- /* skip list head: &svc->destinations is not embedded in a destination */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
- }
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT: not flagged overloaded and weight is positive */
- goto out;
- q = q->next;
- } while (q != p); /* stop once the cursor is back at the starting node */
- write_unlock(&svc->sched_lock);
- return NULL;
-
- out:
- svc->sched_data = q; /* remember the chosen node; the next call starts after it */
- write_unlock(&svc->sched_lock);
- IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d\n",
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->activeconns),
- atomic_read(&dest->refcnt), atomic_read(&dest->weight));
-
- return dest;
- }
|
其他调度结构都是大同小异,后面就只分析核心算法了
加权轮叫调度
Weighted Round-Robin Scheduling
net/ipv4/ipvs/ip_vs_wrr.c- /*
- * Weighted Round-Robin Scheduling
- */
- /*
- * Pick a destination by weighted round-robin.
- * The walk state is kept in svc->sched_data (struct ip_vs_wrr_mark):
- * cl is the current list position, cw the current weight threshold,
- * mw the value cw is reset to once it drops to zero or below, and di
- * the amount subtracted from cw each time the walk wraps past the list head.
- * Returns the selected destination, or NULL when none can be used.
- */
- static struct ip_vs_dest *
- ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest;
- struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
-
- IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
-
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl; /* remember where this call started */
- while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
- "no available servers\n");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
- }
-
- if (mark->cl == p && mark->cw == mark->di) {
- /*
- * back to the start, and no dest is found.
- * It is only possible when all dests are OVERLOADED
- */
- dest = NULL;
- goto out;
- }
- }
-
- IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d\n",
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->activeconns),
- atomic_read(&dest->refcnt),
- atomic_read(&dest->weight));
-
- out:
- write_unlock(&svc->sched_lock);
- return dest;
- }
|
最小连接调度
Least-Connection Scheduling
net/ipv4/ipvs/ip_vs_lc.c- /*
- * Least Connection scheduling
- */
- /*
- * Pick the destination with the smallest ip_vs_lc_dest_overhead() value,
- * ignoring destinations flagged IP_VS_DEST_F_OVERLOAD or with weight 0.
- * Returns NULL when every destination was skipped.
- */
- static struct ip_vs_dest *
- ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
-
- /*
- * Simply select the server with the least number of
- * (activeconns<<5) + inactconns
- * Except whose weight is equal to zero.
- * If the weight is equal to zero, it means that the server is
- * quiesced, the existing connections to the server still get
- * served, but no new connection is assigned to the server.
- */
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
- atomic_read(&dest->weight) == 0)
- continue;
- doh = ip_vs_lc_dest_overhead(dest);
- if (!least || doh < loh) { /* first candidate, or strictly cheaper than the best so far */
- least = dest;
- loh = doh;
- }
- }
-
- if (least)
- IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->inactconns));
-
- return least;
- }
|
加权最小连接调度
Weighted Least-Connection Scheduling
net/ipv4/ipvs/ip_vs_wlc.c- /*
- * Weighted Least Connection scheduling
- */
- /*
- * Pick the destination with the lowest overhead-to-weight ratio, where the
- * overhead comes from ip_vs_wlc_dest_overhead().
- * Returns NULL when no destination is both non-overloaded and of positive weight.
- */
- static struct ip_vs_dest *
- ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest, *least;
- unsigned int loh, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
-
- /*
- * We calculate the load of each dest server as follows:
- * (dest overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connections.
- */
-
- /* stage 1: the first usable destination becomes the initial candidate */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = ip_vs_wlc_dest_overhead(least);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
- doh = ip_vs_wlc_dest_overhead(dest);
- /* with dest->weight == 0 the left side is 0, so such a dest never replaces "least" */
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
- }
|
基于局部性的最少连接
Locality-Based Least Connections Scheduling
net/ipv4/ipvs/ip_vs_lblc.c- /*
- * Weighted least-connection helper of the LBLC scheduler: returns the
- * destination with the lowest (activeconns*50 + inactconns) / weight, or
- * NULL when no destination is both non-overloaded and of positive weight.
- */
- static inline struct ip_vs_dest *
- __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
- {
- struct ip_vs_dest *dest, *least;
- int loh, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- /*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
- * (dest overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connection.
- */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
- if (atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
- }
|
带复制的基于局部性最少连接
Locality-Based Least Connections with Replication Scheduling
net/ipv4/ipvs/ip_vs_lblcr.c- /*
- * Weighted least-connection helper of the LBLCR scheduler: returns the
- * destination with the lowest (activeconns*50 + inactconns) / weight, or
- * NULL when no destination is both non-overloaded and of positive weight.
- */
- static inline struct ip_vs_dest *
- __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
- {
- struct ip_vs_dest *dest, *least;
- int loh, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- /*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
- * (dest overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connection.
- */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- if (atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
- }
-
-
- /*
- * If this destination server is overloaded and there is a less loaded
- * server, then return true.
- */
- static inline int /* 1 when dest counts as overloaded and a lightly loaded server exists, else 0 */
- is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
- {
- if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { /* dest has more active connections than its weight */
- struct ip_vs_dest *d;
-
- list_for_each_entry(d, &svc->destinations, n_list) { /* scan every destination of the service */
- if (atomic_read(&d->activeconns)*2
- < atomic_read(&d->weight)) { /* d runs below half of its weight */
- return 1;
- }
- }
- }
- return 0;
- }
|
目标地址散列调度
Destination Hashing Scheduling
net/ipv4/ipvs/ip_vs_dh.c- /* Flush all the hash buckets of the specified table. */
- static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) /* clears all IP_VS_DH_TAB_SIZE buckets of tbl */
- {
- int i;
- struct ip_vs_dh_bucket *b;
-
- b = tbl;
- for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt); /* decrement the destination's refcnt before clearing the slot */
- b->dest = NULL;
- }
- b++;
- }
- }
-
-
- static int ip_vs_dh_init_svc(struct ip_vs_service *svc) /* returns 0, or -ENOMEM if the table cannot be allocated */
- {
- struct ip_vs_dh_bucket *tbl;
-
- /* allocate the DH table (IP_VS_DH_TAB_SIZE buckets) for this service */
- tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
- return -ENOMEM;
- }
- svc->sched_data = tbl; /* the table is reachable through the service from now on */
- IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- /* fill the buckets; ip_vs_dh_assign() is defined outside this excerpt */
- ip_vs_dh_assign(tbl, svc);
-
- return 0;
- }
-
-
- static int ip_vs_dh_done_svc(struct ip_vs_service *svc) /* always returns 0 */
- {
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* clear the buckets first so the refcnt of each referenced dest is decremented */
- ip_vs_dh_flush(tbl);
-
- /* release the table itself */
- kfree(svc->sched_data);
- IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- return 0;
- }
-
-
- static int ip_vs_dh_update_svc(struct ip_vs_service *svc) /* always returns 0 */
- {
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* clear the existing buckets */
- ip_vs_dh_flush(tbl);
-
- /* fill them again; ip_vs_dh_assign() is defined outside this excerpt */
- ip_vs_dh_assign(tbl, svc);
-
- return 0;
- }
-
-
- /*
- * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- * consider that the server is overloaded here.
- */
- static inline int is_overloaded(struct ip_vs_dest *dest) /* non-zero iff IP_VS_DEST_F_OVERLOAD is set in dest->flags */
- {
- return dest->flags & IP_VS_DEST_F_OVERLOAD;
- }
-
-
- /*
- * Destination hashing scheduling
- */
- static struct ip_vs_dest * /* returns the destination found for the packet's daddr, or NULL if it is unusable */
- ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest;
- struct ip_vs_dh_bucket *tbl;
- struct iphdr *iph = ip_hdr(skb);
-
- IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
-
- tbl = (struct ip_vs_dh_bucket *)svc->sched_data; /* table stored by ip_vs_dh_init_svc() */
- dest = ip_vs_dh_get(tbl, iph->daddr); /* lookup keyed by destination address; ip_vs_dh_get() is not part of this excerpt */
- if (!dest
- || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) { /* missing, not available, non-positive weight or overloaded */
- return NULL;
- }
-
- IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
- "--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(iph->daddr),
- NIPQUAD(dest->addr),
- ntohs(dest->port));
-
- return dest;
- }
|
源地址散列调度
Source Hashing Scheduling
net/ipv4/ipvs/ip_vs_sh.c- /*
- * Source Hashing scheduling
- */
- static struct ip_vs_dest * /* returns the destination found for the packet's saddr, or NULL if it is unusable */
- ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
- struct iphdr *iph = ip_hdr(skb);
-
- IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
-
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data; /* NOTE(review): presumably set up by the SH init_service handler, which is not shown */
- dest = ip_vs_sh_get(tbl, iph->saddr); /* lookup keyed by source address; ip_vs_sh_get() is not part of this excerpt */
- if (!dest
- || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) { /* missing, not available, non-positive weight or overloaded */
- return NULL;
- }
-
- IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
- "--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(iph->saddr),
- NIPQUAD(dest->addr),
- ntohs(dest->port));
-
- return dest;
- }
|
最短预期延时调度
Shortest Expected Delay Scheduling
net/ipv4/ipvs/ip_vs_sed.c- /*
- * Shortest Expected Delay scheduling
- */
- /*
- * Pick the destination with the lowest overhead-to-weight ratio, where the
- * overhead comes from ip_vs_sed_dest_overhead().
- * Returns NULL when no destination is both non-overloaded and of positive weight.
- */
- static struct ip_vs_dest *
- ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest, *least;
- unsigned int loh, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
-
- /*
- * We calculate the load of each dest server as follows:
- * (server expected overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connections.
- */
-
- /* stage 1: the first usable destination becomes the initial candidate */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = ip_vs_sed_dest_overhead(least);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
- doh = ip_vs_sed_dest_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
- }
|
不排队调度
Never Queue Scheduling
net/ipv4/ipvs/ip_vs_nq.c- /*
- * Never Queue scheduling
- */
- /*
- * Return the first usable destination that has no active connections;
- * if there is none, return the one with the lowest
- * ip_vs_nq_dest_overhead()-to-weight ratio.
- * Returns NULL when every destination is overloaded or has weight 0.
- */
- static struct ip_vs_dest *
- ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
- {
- struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh; /* loh: overhead of "least", doh: overhead of "dest" */
-
- IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
-
- /*
- * We calculate the load of each dest server as follows:
- * (server expected overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connections.
- */
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
-
- if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
- !atomic_read(&dest->weight))
- continue;
-
- doh = ip_vs_nq_dest_overhead(dest);
-
- /* return the server directly if it is idle */
- if (atomic_read(&dest->activeconns) == 0) {
- least = dest;
- loh = doh;
- goto out;
- }
-
- if (!least ||
- (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))) {
- least = dest;
- loh = doh;
- }
- }
-
- if (!least)
- return NULL;
-
- out:
- IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
- }
|