summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/tcp_offload.h
blob: 48f35ff66a1480c0170be5b44391d34ea806f639 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/*-
 * Copyright (c) 2007, Chelsio Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Neither the name of the Chelsio Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _NETINET_TCP_OFFLOAD_H_
#define	_NETINET_TCP_OFFLOAD_H_

#ifndef _KERNEL
#error "no user-serviceable parts inside"
#endif

/*
 * A driver publishes that it provides offload services
 * by setting IFCAP_TOE in the ifnet. The offload connect
 * will bypass any further work if the interface that a
 * connection would use does not support TCP offload.
 *
 * The TOE API assumes that the tcp offload engine can offload the
 * entire connection from set up to teardown, with some provision
 * being made to allow the software stack to handle time wait. If
 * the device does not meet these criteria, it is the driver's responsibility
 * to overload the functions that it needs to in tcp_usrreqs and make
 * its own calls to tcp_output if it needs to do so.
 *
 * There is currently no provision for the device advertising the congestion
 * control algorithms it supports as there is currently no API for querying 
 * an operating system for the protocols that it has loaded. This is a desirable
 * future extension.
 *
 *
 *
 * It is assumed that individuals deploying TOE will want connections
 * to be offloaded without software changes so all connections on an
 * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
 * flag is set on the socket.
 *
 *
 * The toe_usrreqs structure constitutes the TOE driver's 
 * interface to the TCP stack for functionality that doesn't
 * interact directly with userspace. If one wants to provide
 * (optional) functionality to do zero-copy to/from
 * userspace one still needs to override soreceive/sosend 
 * with functions that fault in and pin the user buffers.
 *
 * + tu_send
 *   - tells the driver that new data may have been added to the 
 *     socket's send buffer - the driver should not fail if the
 *     buffer is in fact unchanged
 *   - the driver is responsible for providing credits (bytes in the send window)
 *     back to the socket by calling sbdrop() as segments are acknowledged.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_rcvd
 *   - returns credits to the driver and triggers window updates
 *     to the peer (a credit as used here is a byte in the peer's receive window)
 *   - the driver is expected to determine how many bytes have been 
 *     consumed and credit that back to the card so that it can grow
 *     the window again by maintaining its own state between invocations.
 *   - In principle this could be used to shrink the window as well as
 *     grow the window, although it is not used for that now.
 *   - this function needs to correctly handle being called any number of
 *     times without any bytes being consumed from the receive buffer.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_disconnect
 *   - tells the driver to send FIN to peer
 *   - driver is expected to send the remaining data and then do a clean half close
 *   - disconnect implies at least half-close so only send, reset, and detach
 *     are legal
 *   - the driver is expected to handle transition through the shutdown
 *     state machine and allow the stack to support SO_LINGER.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_reset
 *   - closes the connection and sends a RST to peer
 *   - driver is expected to trigger an RST and detach the toepcb
 *   - no further calls are legal after reset
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 *   The following fields in the tcpcb are expected to be referenced by the driver:
 *	+ iss
 *	+ rcv_nxt
 *	+ rcv_wnd
 *	+ snd_isn
 *	+ snd_max
 *	+ snd_nxt
 *	+ snd_una
 *	+ t_flags
 *	+ t_inpcb
 *	+ t_maxseg
 *	+ t_toe
 *
 *   The following fields in the inpcb are expected to be referenced by the driver:
 *	+ inp_lport
 *	+ inp_fport
 *	+ inp_laddr
 *	+ inp_faddr
 *	+ inp_socket
 *	+ inp_ip_tos
 *
 *   The following fields in the socket are expected to be referenced by the
 *   driver:
 *	+ so_comp
 *	+ so_error
 *	+ so_linger
 *	+ so_options
 *	+ so_rcv
 *	+ so_snd
 *	+ so_state
 *	+ so_timeo
 *
 *   These functions all return 0 on success and can return the following errors
 *   as appropriate:
 *	+ EPERM:
 *	+ ENOBUFS: memory allocation failed
 *	+ EMSGSIZE: MTU changed during the call
 *	+ EHOSTDOWN:
 *	+ EHOSTUNREACH:
 *	+ ENETDOWN:
 *	+ ENETUNREACH: the peer is no longer reachable
 *
 * + tu_detach
 *   - tells driver that the socket is going away so disconnect
 *     the toepcb and free appropriate resources
 *   - allows the driver to cleanly handle the case of connection state
 *     outliving the socket
 *   - no further calls are legal after detach
 *   - the driver is expected to provide its own synchronization between
 *     detach and receiving new data.
 * 
 * + tu_syncache_event
 *   - even if it is not actually needed, the driver is expected to
 *     call syncache_add for the initial SYN and then syncache_expand
 *     for the SYN,ACK
 *   - tells driver that a connection either has not been added or has 
 *     been dropped from the syncache
 *   - the driver is expected to maintain state that lives outside the 
 *     software stack so the syncache needs to be able to notify the
 *     toe driver that the software stack is not going to create a connection
 *     for a received SYN
 *   - The driver is responsible for any synchronization required between
 *     the syncache dropping an entry and the driver processing the SYN,ACK.
 * 
 */
/*
 * Table of driver entry points through which the TCP stack drives an
 * offloaded connection.  The int-returning hooks return 0 on success or
 * one of the errno values listed in the contract comment preceding this
 * structure.
 */
struct toe_usrreqs {
	/* New data may have been added to the socket's send buffer. */
	int (*tu_send)(struct tcpcb *tp);
	/* Return receive-window credits to the driver / trigger window updates. */
	int (*tu_rcvd)(struct tcpcb *tp);
	/* Send the remaining data, then FIN (clean half close). */
	int (*tu_disconnect)(struct tcpcb *tp);
	/* Close the connection and send RST; no further calls are legal. */
	int (*tu_reset)(struct tcpcb *tp);
	/* Socket is going away: disconnect the toepcb and free resources. */
	void (*tu_detach)(struct tcpcb *tp);
	/*
	 * A connection was not added to, or was dropped from, the syncache.
	 * NOTE(review): toep is opaque here; presumably the driver's
	 * per-connection state - confirm against the syncache callers.
	 */
	void (*tu_syncache_event)(int event, void *toep);
};

/*
 * Proxy for struct tcpopt between TOE drivers and TCP functions.
 *
 * Only the flags, MSS and window-scale values are carried.  The explicit
 * pad members fill the space after to_wscale up to the next 64-bit
 * boundary, and _pad3 reserves four further 64-bit words whose use is
 * still to be determined.  Do not reorder or resize members: the layout
 * is shared with TOE drivers.
 */
struct toeopt {
	u_int64_t	to_flags;	/* see tcpopt in tcp_var.h */
	u_int16_t	to_mss;		/* maximum segment size */
	u_int8_t	to_wscale;	/* window scaling */

	u_int8_t	_pad1;		/* explicit pad for 64bit alignment */
	u_int32_t	_pad2;		/* explicit pad for 64bit alignment */
	u_int64_t	_pad3[4];	/* TBD */
};

/*
 * Syncache notification codes.
 * NOTE(review): presumably these are the `event' values handed to
 * toe_usrreqs.tu_syncache_event - confirm against the syncache code.
 */
#define	TOE_SC_ENTRY_PRESENT		1	/* 4-tuple already present */
#define	TOE_SC_DROP			2	/* connection was timed out */

/*
 * Because listen is a one-to-many relationship (a socket can be listening 
 * on all interfaces on a machine some of which may be using different TCP
 * offload devices), listen uses a publish/subscribe mechanism. The TCP
 * offload driver registers a listen notification function with the stack.
 * When a listen socket is created all TCP offload devices are notified
 * so that they can do the appropriate set up to offload connections on the
 * port to which the socket is bound. When the listen socket is closed,
 * the offload devices are notified so that they will stop listening on that
 * port and free any associated resources as well as sending RSTs on any
 * connections in the SYN_RCVD state.
 *
 */

/*
 * Handler signatures for the listen start/stop notifications.  The
 * struct tcpcb pointer is the listening connection supplied by the
 * invoker (see tcp_offload_listen_open/close below).
 * NOTE(review): the leading void * is presumably the private argument
 * given at handler registration time, per eventhandler convention -
 * confirm against EVENTHANDLER(9).
 */
typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);

/* Event lists that TOE drivers hook to learn about listen sockets. */
EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);

/*
 * Check if the socket can be offloaded by the following steps:
 * - determine the egress interface
 * - check that the interface has TOE capability and that TOE is enabled
 * - check if the device has resources to offload the connection
 *
 * Returns 0 when the connection was handed to an offload device.  Any
 * non-zero return makes tcp_output_connect() fall back to tcp_output().
 */
int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);

/*
 * The tcp_output_* routines are wrappers around the toe_usrreqs calls
 * which trigger packet transmission. In the non-offloaded case they
 * translate to tcp_output. The tcp_offload_* routines notify TOE
 * of specific events. In the non-offloaded case they are no-ops.
 *
 * Listen is a special case because it is a 1 to many relationship
 * and there can be more than one offload driver in the system.
 */

/*
 * Connection is offloaded.
 *
 * Evaluates to tp->t_flags masked with TF_TOE: non-zero when the flag is
 * set, zero otherwise.  This macro is #undef'd near the end of this
 * header, so it is only available to the inline helpers defined here.
 */
#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)

/*
 * hackish way of allowing this file to also be included by TOE
 * which needs to be kept ignorant of socket implementation details
 */
#ifdef _SYS_SOCKETVAR_H_
/*
 * The socket has not been marked as "do not offload".
 *
 * Evaluates to non-zero when SO_NO_OFFLOAD is clear in so->so_options.
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. &sock or a conditional expression), not just a plain
 * identifier, expands correctly.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)

/*
 * Begin an outgoing connection on `so' towards `nam'.
 *
 * When offload support is compiled in and the socket has not opted out,
 * the connection is first offered to tcp_offload_connect(); if that
 * succeeds, 0 is returned.  In every other case the result of
 * tcp_output() on the socket's tcpcb is returned.
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	/*
	 * Offload is attempted only for sockets that have not opted out;
	 * a successful hand-off leaves nothing more to do here.
	 */
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	/*
	 * Offload is compiled out, disabled for this socket, or the
	 * connection could not be offloaded: call tcp_output to start
	 * the TCP state machine.
	 */
	return (tcp_output(tp));
}

/*
 * Signal that the send buffer may hold new data.  An offloaded
 * connection is routed to the driver's tu_send hook; otherwise (or when
 * offload is compiled out) tcp_output() is called.  The chosen routine's
 * return value is passed back unchanged.
 */
static __inline int
tcp_output_send(struct tcpcb *tp)
{
#ifdef TCP_OFFLOAD_DISABLE
	return (tcp_output(tp));
#else
	return (tp_offload(tp) ? tp->t_tu->tu_send(tp) : tcp_output(tp));
#endif
}

/*
 * Signal that receive-side processing took place.  An offloaded
 * connection is routed to the driver's tu_rcvd hook; otherwise (or when
 * offload is compiled out) tcp_output() is called.  The chosen routine's
 * return value is passed back unchanged.
 */
static __inline int
tcp_output_rcvd(struct tcpcb *tp)
{
#ifdef TCP_OFFLOAD_DISABLE
	return (tcp_output(tp));
#else
	return (tp_offload(tp) ? tp->t_tu->tu_rcvd(tp) : tcp_output(tp));
#endif
}

/*
 * Request a disconnect.  An offloaded connection is routed to the
 * driver's tu_disconnect hook; otherwise (or when offload is compiled
 * out) tcp_output() is called.  The chosen routine's return value is
 * passed back unchanged.
 */
static __inline int
tcp_output_disconnect(struct tcpcb *tp)
{
#ifdef TCP_OFFLOAD_DISABLE
	return (tcp_output(tp));
#else
	return (tp_offload(tp) ? tp->t_tu->tu_disconnect(tp) : tcp_output(tp));
#endif
}

/*
 * Request a connection reset.  An offloaded connection is routed to the
 * driver's tu_reset hook; otherwise (or when offload is compiled out)
 * tcp_output() is called.  The chosen routine's return value is passed
 * back unchanged.
 */
static __inline int
tcp_output_reset(struct tcpcb *tp)
{
#ifdef TCP_OFFLOAD_DISABLE
	return (tcp_output(tp));
#else
	return (tp_offload(tp) ? tp->t_tu->tu_reset(tp) : tcp_output(tp));
#endif
}

/*
 * Tell the TOE driver, through its tu_detach hook, that the socket
 * behind `tp' is going away.  Does nothing for connections that are not
 * offloaded or when offload support is compiled out.
 */
static __inline void
tcp_offload_detach(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	/* Only offloaded connections have a driver to notify. */
	if (!tp_offload(tp))
		return;
	tp->t_tu->tu_detach(tp);
#endif
}

/*
 * Announce a listening tcpcb to the tcp_offload_listen_start event
 * handlers, unless the owning socket has been marked "do not offload".
 * Compiles to nothing when offload support is disabled.
 */
static __inline void
tcp_offload_listen_open(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	/* Sockets that opted out of offload are never announced. */
	if (!SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
		return;
	EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
#endif
}

/*
 * Announce to the tcp_offload_listen_stop event handlers that the
 * listening tcpcb `tp' is being closed.  The notification is sent
 * unconditionally (no SO_NO_OFFLOAD check) and compiles to nothing when
 * offload support is disabled.
 */
static __inline void
tcp_offload_listen_close(struct tcpcb *tp)
{
#if !defined(TCP_OFFLOAD_DISABLE)
	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
#endif
}
#undef SO_OFFLOADABLE
#endif /* _SYS_SOCKETVAR_H_ */
#undef tp_offload

/*
 * Out-of-line entry points defined outside this header.  Unlike the
 * inline helpers above, they are declared regardless of whether
 * <sys/socketvar.h> was included first.
 * NOTE(review): presumably the offload-aware analogues of tcp_twstart(),
 * tcp_close() and tcp_drop() - confirm against the implementation.
 */
void tcp_offload_twstart(struct tcpcb *tp);
struct tcpcb *tcp_offload_close(struct tcpcb *tp);
struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);

#endif /* _NETINET_TCP_OFFLOAD_H_ */