summaryrefslogtreecommitdiffstats
path: root/cpukit/libblock/include/rtems/bdbuf.h
blob: 16f86fcdd32a6f7f109b16b02b4d31bfe6668c97 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
/**
 * @file
 *
 * @ingroup rtems_bdbuf
 *
 * Block device buffer management.
 */
 
/*
 * Copyright (C) 2001 OKTET Ltd., St.-Petersburg, Russia
 * Author: Victor V. Vengerov <vvv@oktet.ru>
 *
 * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org>
 *    Rewritten to remove score mutex access. Fixes many performance
 *    issues.
      Change to support demand driven variable buffer sizes.
 *
 * @(#) bdbuf.h,v 1.9 2005/02/02 00:06:18 joel Exp
 */

#ifndef _RTEMS_BDBUF_H
#define _RTEMS_BDBUF_H

#include <rtems.h>
#include <rtems/libio.h>
#include <rtems/chain.h>

#include <rtems/blkdev.h>
#include <rtems/diskdevs.h>

#ifdef __cplusplus
extern "C" {
#endif
	
/**
 * @defgroup rtems_libblock Block Device Library
 */

/**
 * @defgroup rtems_bdbuf Block Device Buffer Management
 *
 * @ingroup rtems_libblock
 *
 * The Block Device Buffer Management implements a cache between the disk
 * devices and file systems. The code provides read ahead and write queuing to
 * the drivers and fast cache look up using an AVL tree.
 *
 * The block size used by a file system can be set at runtime and must be a
 * multiple of the disk device block size. The disk device's physical block
 * size is called the media block size. The file system can set the block size
 * it uses to a larger multiple of the media block size. The driver must be
 * able to handle buffers sizes larger than one media block.
 *
 * The user configures the amount of memory to be used as buffers in the cache,
 * and the minimum and maximum buffer size. The cache will allocate additional
 * memory for the buffer descriptors and groups. There are enough buffer
 * descriptors allocated so all the buffer memory can be used as minimum sized
 * buffers.
 *
 * The cache is a single pool of buffers. The buffer memory is divided into
 * groups where the size of buffer memory allocated to a group is the maximum
 * buffer size. A group's memory can be divided down into small buffer sizes
 * that are a multiple of 2 of the minimum buffer size. A group is the minumum
 * allocation unit for buffers of a specific size. If a buffer of maximum size
 * is request the group will have a single buffer. If a buffer of minium size
 * is requested the group is divided into minimum sized buffers and the
 * remaining buffers are held ready for use. A group keeps track of which
 * buffers are with a file system or driver and groups who have buffer in use
 * cannot be realloced. Groups with no buffers in use can be taken and
 * realloced to a new size. This is how buffers of different sizes move around
 * the cache.

 * The buffers are held in various lists in the cache. All buffers follow this
 * state machine:
 *                                  
 * @dot
 * digraph g {
 *   ready [label="Ready\nRead Ahead"];
 *   transfer [label="Transfer"];
 *   accessed [label="Accessed\nAccessed Modified"];
 *   modified [label="Modified\nSynchronized"];
 *   cached [label="Cached"];
 *   ready -> transfer [label="Read\nRead Ahead"];
 *   transfer -> ready [label="Read Ahead Complete"];
 *   ready -> accessed [label="Get"];
 *   transfer -> accessed [label="Read or Write\nComplete"];
 *   transfer -> cached [label="Read or Write\nComplete"];
 *   accessed -> cached [label="Release"];
 *   cached -> accessed [label="Get"];
 *   modified -> accessed [label="Get"];
 *   accessed -> modified [label="Modified"];
 *   accessed -> transfer [label="Swap"];
 * }
 * @enddot
 *         
 * Empty buffers are added to the ready list and removed from this queue when a
 * caller requests a buffer. This is referred to as getting a buffer in the
 * code and the event get in the state diagram. The buffer is assigned to a
 * block and inserted to the AVL based on the block/device key. If the block is
 * to be read by the user and not in the cache (ready) it is transfered from
 * the disk into memory. If no ready buffers exist the buffer is taken from the
 * LRU list. If no buffers are on the LRU list the modified list is check. If
 * no buffers are on the modified list the request blocks. If buffers are on
 * the modified list the buffers hold timer is expired and the swap out task
 * woken.
 *
 * A block being accessed is given to the file system layer and not accessable
 * to another requester until released back to the cache. The same goes to a
 * buffer in the transfer state. The transfer state means being read or
 * written. If the file system has modifed the block and releases it as
 * modified it placed on the cache's modified list and a hold timer
 * initialised. The buffer is held for the hold time before being written to
 * disk. Buffers are held for a configurable period of time on the modified
 * list as a write sets the state to transfer and this locks the buffer out
 * from the file system until the write completes. Buffers are often accessed
 * and modified in a series of small updates so if sent to the disk when
 * released as modified the user would have to block waiting until it had been
 * written. This would be a performance problem.
 *
 * The code performs mulitple block reads and writes. Multiple block reads or
 * read ahead increases performance with hardware that supports it. It also
 * helps with a large cache as the disk head movement is reduced. It how-ever
 * is a speculative operation so excessive use can remove valuable and needed
 * blocks from the cache. The get call knows if a read is a for the file system
 * or if it is a read ahead get. If the get is for a read ahead block and the
 * block is already in the cache or no ready buffers are available the read
 * ahead is stopped. The transfer occurs with the blocks so far. If a buffer is
 * in the read ahead state and release it is placed on the ready list rather
 * than the LRU list. This means these buffers are used before buffers used by
 * the file system.
 *
 * The cache has the following lists of buffers:
 *  - @c ready: Empty buffers created when the pool is initialised.
 *  - @c modified: Buffers waiting to be written to disk.
 *  - @c sync: Buffers to be synced to disk.
 *  - @c lru: Accessed buffers released in least recently used order.
 *
 * The cache scans the ready list then the LRU list for a suitable buffer in
 * this order. A suitable buffer is one that matches the same allocation size
 * as the device the buffer is for. The a buffer's group has no buffers in use
 * with the file system or driver the group is reallocated. This means the
 * buffers in the group are invalidated, resized and placed on the ready queue.
 * There is a performance issue with this design. The reallocation of a group
 * may forced recently accessed buffers out of the cache when they should
 * not. The design should be change to have groups on a LRU list if they have
 * no buffers in use.
 *
 * @{
 */

/**
 * State of a buffer in the cache.
 */
typedef enum
{
  RTEMS_BDBUF_STATE_EMPTY = 0,            /**< Not in use. */
  RTEMS_BDBUF_STATE_READ_AHEAD = 1,       /**< Holds read ahead data only */
  RTEMS_BDBUF_STATE_CACHED = 2,           /**< In the cache and available */
  RTEMS_BDBUF_STATE_ACCESS = 3,           /**< The user has the buffer */
  RTEMS_BDBUF_STATE_MODIFIED = 4,         /**< In the cache but modified */
  RTEMS_BDBUF_STATE_ACCESS_MODIFIED = 5,  /**< With the user but modified */
  RTEMS_BDBUF_STATE_SYNC = 6,             /**< Requested to be sync'ed */
  RTEMS_BDBUF_STATE_TRANSFER = 7          /**< Being transferred to or from disk */
} rtems_bdbuf_buf_state;

/**
 * Forward reference to the block.
 */
struct rtems_bdbuf_group;
typedef struct rtems_bdbuf_group rtems_bdbuf_group;

/**
 * To manage buffers we using buffer descriptors (BD). A BD holds a buffer plus
 * a range of other information related to managing the buffer in the cache. To
 * speed-up buffer lookup descriptors are organized in AVL-Tree. The fields
 * 'dev' and 'block' are search keys.
 */
typedef struct rtems_bdbuf_buffer
{
  rtems_chain_node link;       /**< Link the BD onto a number of lists. */

  struct rtems_bdbuf_avl_node
  {
    signed char                cache;  /**< Cache */
    struct rtems_bdbuf_buffer* left;   /**< Left Child */
    struct rtems_bdbuf_buffer* right;  /**< Right Child */
    signed char                bal;    /**< The balance of the sub-tree */
  } avl;

  dev_t             dev;        /**< device number */
  rtems_blkdev_bnum block;      /**< block number on the device */

  unsigned char*    buffer;     /**< Pointer to the buffer memory area */
  int               error;      /**< If not 0 indicate an error value (errno)
                                 * which can be used by user later */

  volatile rtems_bdbuf_buf_state state;  /**< State of the buffer. */

  volatile uint32_t  waiters;    /**< The number of threads waiting on this
                                  * buffer. */
  rtems_bdbuf_group* group;      /**< Pointer to the group of BDs this BD is
                                  * part of. */
  volatile uint32_t  hold_timer; /**< Timer to indicate how long a buffer
                                  * has been held in the cache modified. */
} rtems_bdbuf_buffer;

/**
 * A group is a continuous block of buffer descriptors. A group covers the
 * maximum configured buffer size and is the allocation size for the buffers to
 * a specific buffer size. If you allocate a buffer to be a specific size, all
 * buffers in the group, if there are more than 1 will also be that size. The
 * number of buffers in a group is a multiple of 2, ie 1, 2, 4, 8, etc.
 */
struct rtems_bdbuf_group
{
  rtems_chain_node    link;          /**< Link the groups on a LRU list if they
                                      * have no buffers in use. */
  size_t              bds_per_group; /**< The number of BD allocated to this
                                      * group. This value must be a multiple of
                                      * 2. */
  uint32_t            users;         /**< How many users the block has. */
  rtems_bdbuf_buffer* bdbuf;         /**< First BD this block covers. */
};

/**
 * Buffering configuration definition. See confdefs.h for support on using this
 * structure.
 */
typedef struct rtems_bdbuf_config {
  uint32_t            max_read_ahead_blocks;   /**< Number of blocks to read
                                                * ahead. */
  uint32_t            max_write_blocks;        /**< Number of blocks to write
                                                * at once. */
  rtems_task_priority swapout_priority;        /**< Priority of the swap out
                                                * task. */
  uint32_t            swapout_period;          /**< Period swapout checks buf
                                                * timers. */
  uint32_t            swap_block_hold;         /**< Period a buffer is held. */
  uint32_t            swapout_workers;         /**< The number of worker
                                                * threads for the swapout
                                                * task. */
  rtems_task_priority swapout_worker_priority; /**< Priority of the swap out
                                                * task. */
  size_t              size;                    /**< Size of memory in the
                                                * cache */
  uint32_t            buffer_min;              /**< Minimum buffer size. */
  uint32_t            buffer_max;              /**< Maximum buffer size
                                                * supported. It is also the
                                                * allocation size. */
} rtems_bdbuf_config;

/**
 * External reference to the configuration.
 *
 * The configuration is provided by the application.
 */
extern const rtems_bdbuf_config rtems_bdbuf_configuration;

/**
 * The max_read_ahead_blocks value is altered if there are fewer buffers
 * than this defined max. This stops thrashing in the cache.
 */
#define RTEMS_BDBUF_MAX_READ_AHEAD_BLOCKS_DEFAULT    32

/**
 * Default maximum number of blocks to write at once.
 */
#define RTEMS_BDBUF_MAX_WRITE_BLOCKS_DEFAULT         16

/**
 * Default swap-out task priority.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT    15

/**
 * Default swap-out task swap period in milli seconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_SWAP_PERIOD_DEFAULT 250

/**
 * Default swap-out task block hold time in milli seconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_BLOCK_HOLD_DEFAULT  1000

/**
 * Default swap-out worker tasks. Currently disabled.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASKS_DEFAULT     0

/**
 * Default swap-out worker task priority. The same as the swapout task.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT \
                             RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT

/**
 * Default size of memory allocated to the cache.
 */
#define RTEMS_BDBUF_CACHE_MEMORY_SIZE_DEFAULT (64 * 512)

/**
 * Default minimum size of buffers.
 */
#define RTEMS_BDBUF_BUFFER_MIN_SIZE_DEFAULT (512)

/**
 * Default maximum size of buffers.
 */
#define RTEMS_BDBUF_BUFFER_MAX_SIZE_DEFAULT (4096)

/**
 * Prepare buffering layer to work - initialize buffer descritors and (if it is
 * neccessary) buffers. After initialization all blocks is placed into the
 * ready state.
 *
 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *         successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_init (void);

/**
 * Get block buffer for data to be written into. The buffers is set to the
 * access or modifed access state. If the buffer is in the cache and modified
 * the state is access modified else the state is access. This buffer contents
 * are not initialised if the buffer is not already in the cache. If the block
 * is already resident in memory it is returned how-ever if not in memory the
 * buffer is not read from disk. This call is used when writing the whole block
 * on a disk rather than just changing a part of it. If there is no buffers
 * available this call will block. A buffer obtained with this call will not be
 * involved in a transfer request and will not be returned to another user
 * until released. If the buffer is already with a user when this call is made
 * the call is blocked until the buffer is returned. The highest priority
 * waiter will obtain the buffer first.
 *
 * The block number is the linear block number. This is relative to the start
 * of the partition on the media.
 *
 * @param device Device number (constructed of major and minor device number)
 * @param block  Linear media block number
 * @param bd     Reference to the buffer descriptor pointer.
 *
 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *               successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_get (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);

/**
 * Get the block buffer and if not already in the cache read from the disk. If
 * specified block already cached return. The buffer is set to the access or
 * modifed access state. If the buffer is in the cache and modified the state
 * is access modified else the state is access. If block is already being read
 * from disk for being written to disk this call blocks. If the buffer is
 * waiting to be written it is removed from modified queue and returned to the
 * user. If the buffer is not in the cache a new buffer is obtained and the
 * data read from disk. The call may block until these operations complete. A
 * buffer obtained with this call will not be involved in a transfer request
 * and will not be returned to another user until released. If the buffer is
 * already with a user when this call is made the call is blocked until the
 * buffer is returned. The highest priority waiter will obtain the buffer
 * first. 
 *
 * @param device Device number (constructed of major and minor device number)
 * @param block  Linear media block number
 * @param bd     Reference to the buffer descriptor pointer.
 *
 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *               successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_read (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);

/**
 * Release the buffer obtained by a read call back to the cache. If the buffer
 * was obtained by a get call and was not already in the cache the release
 * modified call should be used. A buffer released with this call obtained by a
 * get call may not be in sync with the contents on disk. If the buffer was in
 * the cache and modified before this call it will be returned to the modified
 * queue. The buffers is returned to the end of the LRU list.
 *
 * @param bd Reference to the buffer descriptor.
 *
 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *         successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_release (rtems_bdbuf_buffer* bd);

/**
 * Release the buffer allocated with a get or read call placing it on the
 * modidied list.  If the buffer was not released modified before the hold
 * timer is set to the configuration value. If the buffer had been released
 * modified before but not written to disk the hold timer is not updated. The
 * buffer will be written to disk when the hold timer has expired, there are
 * not more buffers available in the cache and a get or read buffer needs one
 * or a sync call has been made. If the buffer is obtained with a get or read
 * before the hold timer has expired the buffer will be returned to the user.
 *
 * @param bd Reference to the buffer descriptor.
 *
 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *         successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_release_modified (rtems_bdbuf_buffer* bd);

/**
 * Release the buffer as modified and wait until it has been synchronized with
 * the disk by writing it. This buffer will be the first to be transfer to disk
 * and other buffers may also be written if the maximum number of blocks in a
 * requests allows it.
 *
 * @note This code does not lock the sync mutex and stop additions to the
 *       modified queue.

 * @param bd Reference to the buffer descriptor.
 *
 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *         successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_sync (rtems_bdbuf_buffer* bd);

/**
 * Synchronize all modified buffers for this device with the disk and wait
 * until the transfers have completed. The sync mutex for the cache is locked
 * stopping the addition of any further modifed buffers. It is only the
 * currently modified buffers that are written.
 *
 * @note Nesting calls to sync multiple devices will be handled sequentially. A
 * nested call will be blocked until the first sync request has complete.
 *
 * @param dev Block device number
 *
 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
 *         successfully or error code if error is occured)
 */
rtems_status_code
rtems_bdbuf_syncdev (dev_t dev);

/** @} */

#ifdef __cplusplus
}
#endif

#endif