在进行udp压测的时候,有时会报EAGAIN,udp报EAGAIN,有点不可思议,就跟了一下内核源码,发现了两点:
1:udp 确实有缓存,之前在网上看到他人的博客说,udp没有缓存,我看的是2.6.32.220版本的源码
2:udp在缓存满,或者端口不够的情况下确实会返回EAGAIN,可用netstat -an |more查看发送队列和接收队列
贴源码之前,先描述一下流程,udp的发送流程 sock_sendmsg-->__sock_sendmsg-->__sock_sendmsg_nosec,在函数__sock_sendmsg_nosec中有一行代码
sock->ops->sendmsg(iocb, sock, msg, size),这行代码就开始走分支了,tcp走的是tcp_sendmsg,udp走的是inet_sendmsg
再补充一点,如果是调用writev,socket对应的struct file_operations是socket_file_ops,调用流程是vfs_writev-->do_readv_writev-->do_sync_readv_writev,在do_sync_readv_writev中会调用传进来的参数fn,该参数执行的是socket_file_ops中的sock_aio_write-->do_sock_write-->__sock_sendmsg,到这里就接上了上面直接调用sendto接口时的流程
下面开始贴源码
inet_sendmsg
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
inet_rps_record_flow(sk);
/* We may need to bind the socket. */
if (!inet_sk(sk)->num && inet_autobind(sk))
return -EAGAIN;
/*udp调用udp_sendmsg,这里不会走tcp*/
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
从上面可以看到,(!inet_sk(sk)->num && inet_autobind(sk))这个条件就是判断如果端口不足,就会报EAGAIN,net.ipv4.ip_local_port_range 这个内核参数可以优化
接下来就是调用udp_sendmsg,该函数比较长,简单描述一下,如果设置了cork或者msg_more参数,就会将多个数据聚集成一个udp包发送出去,有兴趣可以自己看代码,我们一般用的情形就是每调用一次,就生成一个udp包,然后发送,每调用一次,就生成一个udp包的流程是在该函数中调用ip_make_skb,然后调用udp_send_skb发送,接下来看ip_make_skb-->__ip_append_data
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
struct inet_cork *cork,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
struct page *page = NULL;
int off = 0;
exthdrlen = transhdrlen ? rt->u.dst.header_len : 0;
length += exthdrlen;
transhdrlen += exthdrlen;
mtu = cork->fragsize;
hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (cork->length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
return -EMSGSIZE;
}
/*
* transhdrlen > 0 means that this is the first fragment and we wish
* it won't be fragmented in the future.
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
skb = skb_peek_tail(queue);
cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->u.dst.dev->features & NETIF_F_UFO)) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
mtu, flags);
if (err)
goto error;
return 0;
}
/* So, what's going on in the loop below?
*
* We use calculated fragment length to generate chained skb,
* each of segments is IP fragment ready for sending to network after
* adding appropriate IP header.
*/
if (!skb)
goto alloc_new_skb;
while (length > 0) {
/* Check if the remaining data fits into current packet. */
copy = mtu - skb->len;
if (copy < length)
copy = maxfraglen - skb->len;
if (copy <= 0) {
char *data;
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
if (skb_prev)
fraggap = skb_prev->len - maxfraglen;
else
fraggap = 0;
/*
* If remaining data exceeds the mtu,
* we know we need more fragment(s).
*/
datalen = length + fraggap;
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
!(rt->u.dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
alloclen = datalen + fragheaderlen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
if (datalen == length + fraggap)
alloclen += rt->u.dst.trailer_len;
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
if (atomic_read(&sk->sk_wmem_alloc) <=
2 * sk->sk_sndbuf)
skb = sock_wmalloc(sk,
alloclen + hh_len + 15, 1,
sk->sk_allocation);
if (unlikely(skb == NULL))
err = -ENOBUFS;
else
/* only the initial fragment is
time stamped */
ipc->shtx.flags = 0;
}
if (skb == NULL)
goto error;
/*
* Fill in the control structures
*/
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
*skb_tx(skb) = ipc->shtx;
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
data += fragheaderlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
data + transhdrlen, fraggap, 0);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
offset += copy;
length -= datalen - fraggap;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
/*
* Put the packet on the pending queue.
*/
__skb_queue_tail(queue, skb);
continue;
}
if (copy > length)
copy = length;
if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
unsigned int off;
off = skb->len;
if (getfrag(from, skb_put(skb, copy),
offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
} else {
int i = skb_shinfo(skb)->nr_frags;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
unsigned int left;
if (page && (left = PAGE_SIZE - off) > 0) {
if (copy >= left)
copy = left;
if (page != frag->page) {
if (i == MAX_SKB_FRAGS) {
err = -EMSGSIZE;
goto error;
}
get_page(page);
skb_fill_page_desc(skb, i, page, off, 0);
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
if (copy > PAGE_SIZE)
copy = PAGE_SIZE;
page = alloc_pages(sk->sk_allocation, 0);
if (page == NULL) {
err = -ENOMEM;
goto error;
}
skb_fill_page_desc(skb, i, page, 0, 0);
frag = &skb_shinfo(skb)->frags[i];
} else {
err = -EMSGSIZE;
goto error;
}
if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
err = -EFAULT;
goto error;
}
off += copy;
frag->size += copy;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
}
offset += copy;
length -= copy;
}
return 0;
error:
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
__ip_append_data这个函数代码较长,其中有两个主要的地方,一个是调用ip_ufo_append_data,一个是调用sock_alloc_send_skb,这两个函数都会返回EAGAIN,并且这两个函数最终调用都是一样的,都是调用sock_alloc_send_skb-->sock_alloc_send_pskb,sock_alloc_send_pskb这个函数就是关键
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
int *errcode)
{
struct sk_buff *skb;
gfp_t gfp_mask;
long timeo;
int err;
gfp_mask = sk->sk_allocation;
if (gfp_mask & __GFP_WAIT)
gfp_mask |= __GFP_REPEAT;
timeo = sock_sndtimeo(sk, noblock);
while (1) {
err = sock_error(sk);
if (err != 0)
goto failure;
err = -EPIPE;
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
skb = alloc_skb(header_len, gfp_mask);
if (skb) {
int npages;
int i;
/* No pages, we're done... */
if (!data_len)
break;
npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
skb->truesize += data_len;
skb_shinfo(skb)->nr_frags = npages;
for (i = 0; i < npages; i++) {
struct page *page;
skb_frag_t *frag;
page = alloc_pages(sk->sk_allocation, 0);
if (!page) {
err = -ENOBUFS;
skb_shinfo(skb)->nr_frags = i;
kfree_skb(skb);
goto failure;
}
frag = &skb_shinfo(skb)->frags[i];
frag->page = page;
frag->page_offset = 0;
frag->size = (data_len >= PAGE_SIZE ?
PAGE_SIZE :
data_len);
data_len -= PAGE_SIZE;
}
/* Full success... */
break;
}
err = -ENOBUFS;
goto failure;
}
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
if (!timeo)
goto failure;
if (signal_pending(current))
goto interrupted;
timeo = sock_wait_for_wmem(sk, timeo);
}
skb_set_owner_w(skb, sk);
return skb;
interrupted:
err = sock_intr_errno(timeo);
failure:
*errcode = err;
return NULL;
}
从上面的if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)这个条件可以看到,如果不满足,就会执行err = -EAGAIN;并且非阻塞的socket timeo是0,所以直接走到failure,错误号EAGAIN返回,到此,就可以看到udp socket发送可以返回EAGAIN,且udp是有缓存的
到此可以看到有两个关键点可能使得udp socket
再啰嗦一下sk->sk_wmem_alloc是当前已经分配的,sk->sk_sndbuf这个是缓存大小,每次申请一个skb,就会调用skb_set_owner_w,里面对sk->sk_wmem_alloc进行累加,
释放是调用sock_wfree,在skb_set_owner_w中会将sock_wfree设置到回调中。
可以调udp socket的缓存进行优化,并且内核参数也有对应的udp_mem udp_rmem_min udp_wmem_min这三个参数