1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
| /**
* sock_sendpage - Socket发送页面(零拷贝)
* @sock: Socket结构
* @page: 要发送的页面
* @offset: 页面内偏移
* @size: 发送大小
* @flags: 发送标志
*
* 通过零拷贝技术发送页面数据
* 返回值:发送的字节数或错误码
*/
ssize_t sock_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags)
{
ssize_t res;
struct msghdr msg = {.msg_flags = flags};
struct kvec iov;
char *kaddr = kmap(page);
iov.iov_base = kaddr + offset;
iov.iov_len = size;
iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
res = sock_sendmsg(sock, &msg);
/*
* 如果这失败了,我们应该返回实际的错误码,但是
* 某些应用程序可能依赖返回的正字节数
*/
kunmap(page);
return res;
}
/**
* tcp_sendpage_locked - TCP锁定发送页面
* @sk: TCP套接字
* @page: 页面
* @offset: 偏移
* @size: 大小
* @flags: 标志
*
* 在已持有套接字锁的情况下发送页面
* 返回值:发送字节数或错误码
*/
ssize_t tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
int err;
ssize_t copied = 0;
/* 检查连接状态 */
if (flags & MSG_OOB) {
err = tcp_send_urg_data(sk, page, offset, size, flags);
goto out_err;
}
/* 如果连接已关闭,返回错误 */
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
while (size > 0) {
struct sk_buff *skb;
size_t copy, i;
bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - tcp_send_head(sk)->len) <= 0) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
tcp_rtx_and_write_queues_empty(sk));
if (!skb)
goto wait_for_memory;
/*
* 检查是否需要插入TSO选项和时间戳选项
*/
skb_entail(sk, skb);
copy = size_goal;
}
if (copy > size)
copy = size;
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
if (!can_coalesce && i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
}
if (!(flags & MSG_NO_SHARED_FRAGS))
skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
copied += copy;
offset += copy;
size -= copy;
if (!size)
goto out;
if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
err = sk_stream_wait_memory(sk, &timeo);
if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
if (copied) {
tcp_tx_timestamp(sk, sk->sk_tsflags);
if (!(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
return copied;
do_error:
tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
if (copied)
goto out;
out_err:
/* 确保我们返回一个连接错误 */
if (sk->sk_err)
err = -sk->sk_err;
else if (sk->sk_shutdown & SEND_SHUTDOWN)
err = -EPIPE;
else if (sk->sk_state == TCP_CLOSE)
err = -ENOTCONN;
return err;
}
|