由于要做一个类似LVS的包转发模块,研究了LVS的架构和代码,下面这个系列会做一个总结。首先推荐下这个blog http://yfydz.cublog.cn 里面对LVS, IPSec的讲解非常不错

几个重要的数据结构如下:

ip_vs_conn:一个连接由N元组构成,包括 caddr (客户端地址cip), vaddr (服务虚拟地址vip), daddr (目的realserver地址dip), cport (客户端连接端口), vport (服务虚拟端口), dport (目的realserver端口), protocol (协议)

ip_vs_service:代表一个虚拟服务。LVS中虚拟服务代表一个虚拟IP和端口,作为服务的入口,后面跟着一些realserver,在这些realserver之间做负载平衡。ip_vs_service中包括了protocol, addr, port。struct list_head destinations, __u32 num_dests则代表了后面realserver的链表和个数

ip_vs_dest:代表一个realserver。addr, port, weight分别代表了realserver的ip, port, 权重。struct dst_entry *dst_cache代表了从LVS到realserver的路由缓存项,在我看来这个应该只对NAT, tunnel模式有效。vport, vaddr, protocol代表了虚拟服务地址,端口和协议

ip_vs_scheduler:所有调度器的基类,对ip_vs_service进行调度,其最重要的方法是 struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff* skb),从ip_vs_service下的ip_vs_dest链表(destinations)中选取一个出来返回


static int __init ip_vs_init(void)用来初始化ipvs.ko,也就是LVS的核心模块:

ip_vs_control_init调用nf_register_sockopt注册struct nf_sockopt_ops结构,ip_vs_genl_register注册struct genl_ops ip_vs_genl_ops[]数组,这是通过netlink进行控制的命令结构。

ip_vs_protocol_init依次注册了ip_vs_protocol_tcp, ip_vs_protocol_udp, ip_vs_protocol_ah, ip_vs_protocol_esp四个协议

ip_vs_conn_init首先调用vmalloc分配一块大的内存(64k)区域用于存放连接哈希表的桶数组,也就是说有4096个list_head。

LVS最后调用nf_register_hooks,向netfilter注册自己的钩子结构。LVS一共有4个钩子(不算IPV6),

/*
 * IPv4 netfilter hooks registered by IPVS: one at LOCAL_IN, two at FORWARD
 * and one at POST_ROUTING.
 */
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
    /*
     * After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT (change destination), so that filtering rules can be
     * applied to IPVS.
     */
    {
        .hook     = ip_vs_in,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = 100,
    },

    /* After packet filtering, change source only for VS/NAT. */
    {
        .hook     = ip_vs_out,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_FORWARD,
        .priority = 100,
    },

    /*
     * After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections.
     */
    {
        .hook     = ip_vs_forward_icmp,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_FORWARD,
        .priority = 99,
    },

    /* Before the netfilter connection tracking, exit from POST_ROUTING. */
    {
        .hook     = ip_vs_post_routing,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_NAT_SRC-1,
    },
};

LVS无论是VS/DR, VS/TUN, VS/NAT哪种模式,由于vip配置在LVS上,因此访问vip的流量首先会走到NF_INET_LOCAL_IN,从而调用ip_vs_in
/*
 * Hook function registered at NF_INET_LOCAL_IN (see ip_vs_ops).
 *
 * This is an excerpt: each "..." stands for code omitted from the listing,
 * so several locals used below (iph, af, pp, cp, ret, restart) are declared
 * and initialised in the omitted parts.
 *
 * Visible flow: ignore packets not addressed to this host, look up (or
 * schedule) the connection for the packet, drop the packet if the chosen
 * destination is unavailable, then update statistics and connection state
 * and hand the packet to the connection's packet_xmit method.
 */
static unsigned int
 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
      const struct net_device *in, const struct net_device *out,
      int (*okfn)(struct sk_buff *))
 {    ...
    // ip_vs_in only handles packets addressed to this host
    if (unlikely(skb->pkt_type != PACKET_HOST)) {
         IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
                   skb->pkt_type,
                   iph.protocol,
                   IP_VS_DBG_ADDR(af, &iph.daddr));
         return NF_ACCEPT;
     }    ...
    /*
      * Check if the packet belongs to an existing connection entry
      */    // conn_in_get is implemented per protocol; for TCP it calls tcp_conn_in_get to obtain an ip_vs_conn
     cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);    if (unlikely(!cp)) {
         int v;
     
         /* For local client packets, it could be a response */
         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);  // check whether the packet belongs to an outgoing connection
         if (cp)
             return handle_response(af, skb, pp, cp, iph.len); // mainly performs SNAT
     
         if (!pp->conn_schedule(af, skb, pp, &v, &cp))    // for TCP this runs tcp_conn_schedule: scheduling picks a real server for the client and saves the resulting conn, so later packets are forwarded directly from that ip_vs_conn
             return v;   
     }    ...
    /* Check the server status */
     if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
         /* the destination server is not available */
         if (sysctl_ip_vs_expire_nodest_conn) {
             /* try to expire the connection immediately */
             ip_vs_conn_expire_now(cp);
         }
         /* don't restart its timer, and silently
            drop the packet. */
         __ip_vs_conn_put(cp);  // the real server behind this connection is unavailable: release cp; the packet is dropped just below
         return NF_DROP;
     }    ip_vs_in_stats(cp, skb);
     restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);  // for TCP this calls tcp_state_transition to advance the connection's state machine
     if (cp->packet_xmit)
         ret = cp->packet_xmit(skb, cp, pp);  // the transmit method depends on the forwarding mode, e.g. ip_vs_nat_xmit for NAT, ip_vs_dr_xmit for DR
         /* do not touch skb anymore */
     else {
         IP_VS_DBG_RL("warning: packet_xmit is null");
         ret = NF_ACCEPT;
     }    ...
}

LVS的VS/DR, VS/TUN都是单臂模式,只有VS/NAT是双臂模式。在VS/NAT模式下,LVS会作为realserver回包的next hop,因此在NF_INET_FORWARD上注册ip_vs_out,用来处理NAT模式下的回包

static unsigned int
 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
       const struct net_device *in, const struct net_device *out,
       int (*okfn)(struct sk_buff *))
 {
     struct ip_vs_iphdr iph;
     struct ip_vs_protocol *pp;
     struct ip_vs_conn *cp;
     int af;    ....
    ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  // 填充ip_vs_iphdr的IP头
    if (unlikely(iph.protocol == IPPROTO_ICMP)) {  // 这部分代码用来处理icmp报文,主要逻辑在ip_vs_out_icmp上,该函数用来处理outgoing方向的icmp
         int related, verdict = ip_vs_out_icmp(skb, &related);        if (related)
             return verdict;
         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
     }    ....
    if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) {   // 如果是IP分片的包,那么调用ip_vs_gather_frags先尝试整合成一个完整包,具体请参考内核IP层的frag/defrag的相关代码
         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
             return NF_STOLEN;

         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
     }    /*
      * Check if the packet belongs to an existing entry
      */
     cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);  // 查找是否有已有连接

     if (unlikely(!cp)) {
         if (sysctl_ip_vs_nat_icmp_send &&
             (pp->protocol == IPPROTO_TCP ||
              pp->protocol == IPPROTO_UDP)) {
             __be16 _ports[2], *pptr;

             pptr = skb_header_pointer(skb, iph.len,
                           sizeof(_ports), _ports);
             if (pptr == NULL)
                 return NF_ACCEPT;   /* Not for me */
             if (ip_vs_lookup_real_service(af, iph.protocol,
                               &iph.saddr,
                               pptr[0])) {   // 查看这个realserver是否在LVS的hash表中,如果是真实的realserver,返回一个ICMP不可达                /*
                  * Notify the real server: there is no
                  * existing entry if it is not RST
                  * packet or not TCP packet.
                  */
                 if (iph.protocol != IPPROTO_TCP
                     || !is_tcp_reset(skb, iph.len)) {
                         icmp_send(skb,
                               ICMP_DEST_UNREACH,
                               ICMP_PORT_UNREACH, 0);
                     return NF_DROP;
                 }
             }
         }
         IP_VS_DBG_PKT(12, pp, skb, 0,
                   "packet continues traversal as normal");
         return NF_ACCEPT;
     }    return handle_response(af, skb, pp, cp, iph.len);  // handle_response真正去做SNAT
 }
/*
 * Rewrite a reply packet belonging to connection cp so that its source
 * address becomes the virtual address of the service (SNAT).
 *
 * ip_vs_conn_put(cp) is called on every path. Returns NF_ACCEPT after a
 * successful rewrite; on any failure the skb is freed here and NF_STOLEN
 * is returned.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                struct ip_vs_conn *cp, int ihl)
{
    /*
     * The skb is about to be modified; on this kernel version that
     * requires a successful skb_make_writable() first.
     */
    if (!skb_make_writable(skb, ihl)) {
        goto drop;
    }

    /*
     * Mangle the packet. For TCP this calls tcp_snat_handler, which
     * mainly rewrites the TCP header and then recomputes its checksum.
     */
    if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) {
        goto drop;
    }

    /* SNAT: replace the source IP with the virtual IP ... */
    ip_hdr(skb)->saddr = cp->vaddr.ip;
    /* ... and recompute the IP header checksum. */
    ip_send_check(ip_hdr(skb));

    /*
     * For policy routing, packets originating from this machine itself
     * may be routed differently to packets passing through. The source
     * is now a local address rather than that of a forwarded packet, so
     * the routing information is recomputed as if the packet came from
     * this machine.
     */
    if (ip_route_me_harder(skb, RTN_LOCAL) != 0) {
        goto drop;
    }

    ip_vs_out_stats(cp, skb);
    /* For TCP this calls tcp_state_transition. */
    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
    ip_vs_conn_put(cp);

    /* Mark the skb as already processed by IPVS. */
    skb->ipvs_property = 1;

    LeaveFunction(11);
    return NF_ACCEPT;

drop:
    ip_vs_conn_put(cp);
    kfree_skb(skb);
    return NF_STOLEN;
}
LVS在NF_INET_POST_ROUTING chain上还注册了一个优先级为NF_IP_PRI_NAT_SRC - 1的hook函数ip_vs_post_routing。该函数在iptables SNAT之前执行,检查LVS是否处理过该skb,如果处理过则跳过下面的netfilter hook点
/*
 *      Hooked at the NF_INET_POST_ROUTING chain with a priority just
 *      before NF_IP_PRI_NAT_SRC, and used for VS/NAT.
 *      Packets already marked by IPVS (skb->ipvs_property set) leave the
 *      chain here, which keeps iptable_nat from mangling VS/NAT packets;
 *      all other packets continue through the chain.
 */
static unsigned int ip_vs_post_routing(unsigned int hooknum,
                                       struct sk_buff *skb,
                                       const struct net_device *in,
                                       const struct net_device *out,
                                       int (*okfn)(struct sk_buff *))
{
    /* NF_STOP for packets sent from IPVS, NF_ACCEPT for everything else. */
    return skb->ipvs_property ? NF_STOP : NF_ACCEPT;
}

netfilter框架下,NF_HOOK宏会调用到nf_hook_slow,进而调用nf_iterate,即对于特定PF下的特定HOOKNUM,按优先级依次调用上面注册的所有hook函数。只有当所有函数都返回NF_ACCEPT,或者其中某个函数返回NF_STOP时,nf_hook_slow才会放行该报文(继续执行okfn)。与NF_ACCEPT不同的是,NF_STOP会立即结束遍历,该挂载点上尚未执行的、优先级靠后的hook函数都会被跳过。