From: Chuck Lever

Here's the meat of the NFS direct I/O changes.  It touches just
fs/nfs/direct.c (and one line of fs/nfs/file.c).  It provides basic support
for NFS O_DIRECT similar to what appears in 2.4, but it uses the 2.5
kernel's direct I/O APIs instead.  Another difference is that it invokes
the NFS proto methods instead of assembling RPCs itself (this is allowed by
the previous patches), which moves implementation details specific to each
version of NFS out of this code.  Again, this has been put through only
very basic tests, but it is enough to present for comments.

diff -Naurp 24-commit_proc/fs/nfs/direct.c 25-odirect/fs/nfs/direct.c

 fs/nfs/direct.c        |  459 ++++++++++++++++++++++++++++++++++---------------
 include/linux/nfs_fs.h |    2
 2 files changed, 325 insertions(+), 136 deletions(-)

diff -puN fs/nfs/direct.c~nfs-O_DIRECT fs/nfs/direct.c
--- 25/fs/nfs/direct.c~nfs-O_DIRECT	2003-06-18 16:26:42.000000000 -0700
+++ 25-akpm/fs/nfs/direct.c	2003-06-18 16:26:42.000000000 -0700
@@ -1,7 +1,7 @@
 /*
  * linux/fs/nfs/direct.c
  *
- * Copyright (C) 2001 by Chuck Lever
+ * Copyright (C) 2003 by Chuck Lever
  *
  * High-performance uncached I/O for the Linux NFS client
  *
@@ -26,19 +26,23 @@
  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
  * an undocumented mount option.
  *
- * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
  *
  * 18 Dec 2001	Initial implementation for 2.4  --cel
  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
- * 24 Sep 2002	Rewrite to use asynchronous RPCs, port to 2.5 --cel
+ * 08 Jun 2003	Port to 2.5 APIs  --cel
  *
  */

 #include
+#include
 #include
 #include
+#include
 #include
-#include
+#include
+
 #include
 #include
 #include
@@ -46,35 +50,41 @@
 #include
 #include

-#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
+#define NFSDBG_FACILITY		NFSDBG_VFS
 #define VERF_SIZE		(2 * sizeof(__u32))
+#define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)

 /**
- * nfs_get_user_pages - find and set up page representing user buffer
- * addr: user-space address of target buffer
- * size: total size in bytes of target buffer
- * @pages: returned array of page struct pointers underlying target buffer
- * write: whether or not buffer is target of a write operation
+ * nfs_get_user_pages - find and set up pages underlying user's buffer
+ * rw: direction (read or write)
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * @pages: returned array of page struct pointers underlying user's buffer
  */
 static inline int
-nfs_get_user_pages(unsigned long addr, size_t size,
-		struct page ***pages, int rw)
+nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
+		struct page ***pages)
 {
 	int result = -ENOMEM;
-	unsigned page_count = (unsigned) size >> PAGE_SHIFT;
-	unsigned array_size = (page_count * sizeof(struct page *)) + 2U;
+	unsigned long page_count;
+	size_t array_size;

-	*pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
+	/* set an arbitrary limit to prevent arithmetic overflow */
+	if (size > MAX_DIRECTIO_SIZE)
+		return -EFBIG;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
 	if (*pages) {
 		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, addr,
-					page_count, (rw == WRITE), 0,
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0)
-			printk(KERN_ERR "%s: get_user_pages result %d\n",
-					__FUNCTION__, result);
 	}
 	return result;
 }
@@ -84,174 +94,353 @@ nfs_get_user_pages(unsigned long addr, s
  * @pages: array of page struct pointers underlying target buffer
  */
 static inline void
-nfs_free_user_pages(struct page **pages, unsigned count)
+nfs_free_user_pages(struct page **pages)
 {
-	unsigned page = 0;
+	kfree(pages);
+}

-	while (count--)
-		page_cache_release(pages[page++]);
+/**
+ * nfs_direct_read_seg - Read in one iov segment.  Generate separate
+ *                        read RPCs for each "rsize" bytes.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+static int
+nfs_direct_read_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int rsize = NFS_SERVER(inode)->rsize;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_read_data	rdata = {
+		.flags		= 0,
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &rdata.fattr,
+		},
+	};
+
+	do {
+		int request, result;
+
+		request = count;
+		if (count > rsize)
+			request = rsize;
+		rdata.args.count = request,
+		rdata.args.pgbase = user_addr & ~PAGE_MASK;
+		rdata.args.offset = file_offset;
+		rdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			rdata.args.count, rdata.args.offset,
+			user_addr, rdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->read(&rdata);
+		unlock_kernel();
+
+		if (result < 0) {
+			if (result == -EISDIR)
+				result = -EINVAL;
+			return result;
+		}

-	kfree(pages);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		/*
+		 * stay on the same page if we haven't gone farther
+		 * than the page size.
+		 */
+		if ((rdata.args.pgbase + result) >= PAGE_SIZE)
+			curpage += (rdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	return tot_bytes;
 }

 /**
- * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_read - For each iov segment, map the user's buffer
+ *                    then generate read RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
- * @requests: append new page requests to this list head
+ *
+ * generic_file_direct_IO has already pushed out any non-direct
+ * writes so that this read will see them when we read from the
+ * server.
  */
 static int
-nfs_iov2pagelist(int rw, const struct inode *inode,
-		const struct rpc_cred *cred,
-		const struct iovec *iov, loff_t offset,
-		unsigned long nr_segs, struct list_head *requests)
+nfs_direct_read(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	unsigned seg;
 	int tot_bytes = 0;
-	struct page **pages;
+	unsigned long seg = 0;
+
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages;
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages);
+			return page_count;
+		}

-	/* for each iovec in the array... */
-	for (seg = 0; seg < nr_segs; seg++) {
-		const unsigned long user_addr =
-			(unsigned long) iov[seg].iov_base;
-		size_t bytes = iov[seg].iov_len;
-		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
-		int page_count, page = 0;
-
-		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
-		if (page_count < 0) {
-			return page_count;
+		result = nfs_direct_read_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}

-		/* ...build as many page requests as required */
-		while (bytes > 0) {
-			struct nfs_page *new;
-			const unsigned int pg_bytes = (bytes > PAGE_SIZE) ?
-							PAGE_SIZE : bytes;
-
-			new = nfs_create_request((struct rpc_cred *) cred,
-						 (struct inode *) inode,
-						 pages[page],
-						 pg_offset, pg_bytes);
-			if (IS_ERR(new)) {
-				nfs_free_user_pages(pages, page_count);
-				nfs_release_list(requests);
-				return PTR_ERR(new);
-			}
-			new->wb_index = offset;
-			nfs_list_add_request(new, requests);
-
-			/* after the first page */
-			pg_offset = 0;
-			offset += PAGE_SIZE;
-			tot_bytes += pg_bytes;
-			bytes -= pg_bytes;
-			page++;
+		nfs_free_user_pages(pages);
+	}
+
+	return tot_bytes;
+}
+
+/**
+ * nfs_direct_write_seg - Write out one iov segment.  Generate separate
+ *                         write RPCs for each "wsize" bytes, then commit.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+static int
+nfs_direct_write_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int wsize = NFS_SERVER(inode)->wsize;
+	loff_t save_offset = file_offset;
+	size_t save_count = count;
+	int need_commit = 0;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_writeverf first_verf;
+	struct nfs_write_data	wdata = {
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &wdata.fattr,
+			.verf		= &wdata.verf,
+		},
+	};
+
+	wdata.args.stable = NFS_UNSTABLE;
+	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+		wdata.args.stable = NFS_FILE_SYNC;
+
+retry:
+	do {
+		int request, result;
+
+		request = count;
+		if (count > wsize)
+			request = wsize;
+		wdata.args.count = request,
+		wdata.args.pgbase = user_addr & ~PAGE_MASK;
+		wdata.args.offset = file_offset;
+		wdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			wdata.args.count, wdata.args.offset,
+			user_addr, wdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->write(&wdata);
+		unlock_kernel();
+
+		if (result < 0)
+			return result;
+
+		if (!tot_bytes)
+			memcpy(&first_verf.verifier, &wdata.verf.verifier,
+								VERF_SIZE);
+		if (wdata.verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier,
+					&wdata.verf.verifier, VERF_SIZE))
+				goto sync_retry;
 		}

-		/* don't release pages here -- I/O completion will do that */
-		nfs_free_user_pages(pages, 0);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		/*
+		 * stay on the same page if we haven't gone farther
+		 * than the page size.
+		 */
+		if ((wdata.args.pgbase + result) >= PAGE_SIZE)
+			curpage += (wdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	/*
+	 * Commit data written so far, even in the event of an error
+	 */
+	if (need_commit) {
+		int result;
+
+		wdata.args.count = tot_bytes;
+		wdata.args.offset = save_offset;
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->commit(&wdata);
+		unlock_kernel();
+
+		if (result < 0)
+			goto sync_retry;
+		if (memcmp(&first_verf.verifier, &wdata.verf.verifier,
+								VERF_SIZE))
+			goto sync_retry;
 	}

 	return tot_bytes;
+
+sync_retry:
+	wdata.args.stable = NFS_FILE_SYNC;
+	file_offset = save_offset;
+	count = save_count;
+	goto retry;
 }

 /**
- * do_nfs_direct_IO - Read or write data without caching
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_write - For each iov segment, map the user's buffer
+ *                     then generate write and commit RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
- * Break the passed-in iovec into a series of page-sized or smaller
- * requests, where each page is mapped for direct user-land I/O.
- *
- * For each of these pages, create an NFS page request and
- * append it to an automatic list of page requests.
- *
- * When all page requests have been queued, start the I/O on the
- * whole list.  The underlying routines coalesce the pages on the
- * list into a bunch of asynchronous "r/wsize" network requests.
- *
- * I/O completion automatically unmaps and releases the pages.
+ * Upon return, generic_file_direct_IO invalidates any cached pages
+ * that non-direct readers might access, so they will pick up these
+ * writes immediately.
  */
 static int
-do_nfs_direct_IO(int rw, const struct inode *inode,
-		const struct rpc_cred *cred, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+nfs_direct_write(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	LIST_HEAD(requests);
-	int result, tot_bytes;
+	int tot_bytes = 0;
+	unsigned long seg = 0;

-	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
-			&requests);
-	if (result < 0)
-		return result;
-	tot_bytes = result;
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages;
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages);
+			return page_count;
+		}

-	switch (rw) {
-	case READ:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
-			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
-			break;
+		result = nfs_direct_write_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}
-		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
-		break;
-	case WRITE:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE))
-			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
-		else
-			result = nfs_flush_list(&requests,
-					NFS_SERVER(inode)->wpages, FLUSH_WAIT);
-		/* invalidate cache so non-direct readers pick up changes */
-		invalidate_inode_pages((struct inode *) inode);
-		break;
-	default:
-		result = -EINVAL;
-		break;
+		nfs_free_user_pages(pages);
 	}

-	if (result < 0)
-		return result;
 	return tot_bytes;
 }

 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * rw: direction (read or write)
- * @file: file struct of target file
+ * @iocb: target I/O control block
  * @iov: array of vectors that define I/O buffer
- * offset: offset in file to begin the operation
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
+ * Usually a file system implements direct I/O by calling out to
+ * blockdev_direct_IO.  The NFS client doesn't have a backing block
+ * device, so we do everything by hand instead.
+ *
  * The inode's i_sem is no longer held by the VFS layer before it calls
  * this function to do a write.
  */
 int
 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+		loff_t file_offset, unsigned long nr_segs)
 {
-	/* None of this works yet, so prevent it from compiling. */
-#if 0
-	int result;
+	int result = -EINVAL;
+	struct file *file = iocb->ki_filp;
 	struct dentry *dentry = file->f_dentry;
-	const struct inode *inode = dentry->d_inode->i_mapping->host;
-	const struct rpc_cred *cred = nfs_file_cred(file);
-#endif
-
-	dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
-			((rw == READ) ? "READ" : "WRITE"),
"READ" : "WRITE"), - dentry->d_parent->d_name.name, - dentry->d_name.name, offset, nr_segs); + struct inode *inode = dentry->d_inode; + struct rpc_cred *cred; + + /* + * No support for async yet + */ + if (!is_sync_kiocb(iocb)) + goto out; + + cred = get_rpccred(nfs_file_cred(file)); + if (!cred) + cred = get_rpccred(NFS_I(inode)->mm_cred); + + switch (rw) { + case READ: + dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); - result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs); + result = nfs_direct_read(inode, cred, iov, + file_offset, nr_segs); + break; + case WRITE: + dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); + + result = nfs_direct_write(inode, cred, iov, + file_offset, nr_segs); + break; + default: + break; + } - dfprintk(VFS, "NFS: direct_IO result = %d\n", result); + if (cred) + put_rpccred(cred); +out: + dprintk("NFS: direct_IO result=%d\n", result); return result; } diff -puN include/linux/nfs_fs.h~nfs-O_DIRECT include/linux/nfs_fs.h --- 25/include/linux/nfs_fs.h~nfs-O_DIRECT 2003-06-18 16:26:42.000000000 -0700 +++ 25-akpm/include/linux/nfs_fs.h 2003-06-18 16:26:42.000000000 -0700 @@ -269,7 +269,7 @@ nfs_file_cred(struct file *file) /* * linux/fs/nfs/direct.c */ -extern int nfs_direct_IO(int, struct file *, const struct iovec *, loff_t, +extern int nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, unsigned long); /* _