diff -Nru a/fs/xfs/Makefile b/fs/xfs/Makefile --- a/fs/xfs/Makefile Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/Makefile Thu Jan 29 23:15:58 2004 @@ -30,7 +30,7 @@ # http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ # -EXTRA_CFLAGS += -Ifs/xfs -funsigned-char +EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux -funsigned-char ifeq ($(CONFIG_XFS_DEBUG),y) EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG @@ -83,6 +83,7 @@ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ + xfs_behavior.o \ xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ @@ -106,6 +107,7 @@ xfs_inode.o \ xfs_inode_item.o \ xfs_iocore.o \ + xfs_iomap.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ @@ -126,18 +128,15 @@ xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o -# Objects in pagebuf/ -xfs-y += pagebuf/page_buf.o - # Objects in linux/ xfs-y += $(addprefix linux/, \ + mrlock.o \ xfs_aops.o \ - xfs_behavior.o \ + xfs_buf.o \ xfs_file.o \ xfs_fs_subr.o \ xfs_globals.o \ xfs_ioctl.o \ - xfs_iomap.o \ xfs_iops.o \ xfs_lrw.o \ xfs_super.o \ @@ -148,7 +147,6 @@ xfs-y += $(addprefix support/, \ debug.o \ move.o \ - mrlock.o \ qsort.o \ uuid.o) diff -Nru a/fs/xfs/linux/kmem.h b/fs/xfs/linux/kmem.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/kmem.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/* + * Cutoff point to use vmalloc instead of kmalloc. + */ +#define MAX_SLAB_SIZE 0x10000 + +/* + * XFS uses slightly different names for these due to the + * IRIX heritage.
+ */ +#define kmem_zone kmem_cache_s +#define kmem_zone_t kmem_cache_t + +#define KM_SLEEP 0x0001 +#define KM_NOSLEEP 0x0002 +#define KM_NOFS 0x0004 + +typedef unsigned long xfs_pflags_t; + +#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS) + +#define PFLAGS_SET_FSTRANS(STATEP) do { \ + *(STATEP) = current->flags; \ + current->flags |= PF_FSTRANS; \ +} while (0) + +#define PFLAGS_RESTORE(STATEP) do { \ + current->flags = *(STATEP); \ +} while (0) + +#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \ + *(NSTATEP) = *(OSTATEP); \ +} while (0) + +/* + * XXX get rid of the unconditional __GFP_NOFAIL by adding + * a KM_FAIL flag and using it where we're allowed to fail. + */ +static __inline unsigned int +kmem_flags_convert(int flags) +{ + int lflags; + +#if DEBUG + if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) { + printk(KERN_WARNING + "XFS: memory allocation with wrong flags (%x)\n", flags); + BUG(); + } +#endif + + lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL); + + /* avoid recursive callbacks to filesystem during transactions */ + if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + + return lflags; +} + +static __inline void * +kmem_alloc(size_t size, int flags) +{ + if (unlikely(MAX_SLAB_SIZE < size)) + /* Avoid doing filesystem sensitive stuff to get this */ + return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL); + return kmalloc(size, kmem_flags_convert(flags)); +} + +static __inline void * +kmem_zalloc(size_t size, int flags) +{ + void *ptr = kmem_alloc(size, flags); + if (likely(ptr != NULL)) + memset(ptr, 0, size); + return ptr; +} + +static __inline void +kmem_free(void *ptr, size_t size) +{ + if (unlikely((unsigned long)ptr < VMALLOC_START || + (unsigned long)ptr >= VMALLOC_END)) + kfree(ptr); + else + vfree(ptr); +} + +static __inline void * +kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags) +{ + void *new = kmem_alloc(newsize, flags); + + if (likely(ptr != NULL)) { + if (likely(new != NULL)) + memcpy(new, ptr, min(oldsize, newsize)); + kmem_free(ptr, oldsize); + } + + return new; +} + +static __inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); +} + +static __inline void * +kmem_zone_alloc(kmem_zone_t *zone, int flags) +{ + return kmem_cache_alloc(zone, kmem_flags_convert(flags)); +} + +static __inline void * +kmem_zone_zalloc(kmem_zone_t *zone, int flags) +{ + void *ptr = kmem_zone_alloc(zone, flags); + if (likely(ptr != NULL)) + memset(ptr, 0, kmem_cache_size(zone)); + return ptr; +} + +static __inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +typedef struct shrinker *kmem_shaker_t; +typedef int (*kmem_shake_func_t)(int, unsigned int); + +static __inline kmem_shaker_t +kmem_shake_register(kmem_shake_func_t sfunc) +{ + return set_shrinker(DEFAULT_SEEKS, sfunc); +} + +static __inline void +kmem_shake_deregister(kmem_shaker_t shrinker) +{ + remove_shrinker(shrinker); +} + +static __inline int +kmem_shake_allow(unsigned int gfp_mask) +{ + return (gfp_mask & __GFP_WAIT); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */
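[Reviewer note: for context, this is how the IRIX-flavoured wrappers above compose in practice. A minimal sketch, not code from the patch; the zone name, structure and function are invented for illustration.]

    #include "kmem.h"

    typedef struct xfs_foo { int f_busy; } xfs_foo_t;  /* hypothetical */
    static kmem_zone_t *foo_zone;

    static void foo_example(void)
    {
        xfs_foo_t *fp;
        void *buf;

        foo_zone = kmem_zone_init(sizeof(xfs_foo_t), "xfs_foo");

        /* KM_SLEEP may block; KM_NOFS avoids recursing into the fs */
        fp = kmem_zone_zalloc(foo_zone, KM_SLEEP);
        kmem_zone_free(foo_zone, fp);

        /* sizes past MAX_SLAB_SIZE transparently use __vmalloc, and
         * kmem_free picks vfree/kfree by looking at the address range */
        buf = kmem_alloc(2 * MAX_SLAB_SIZE, KM_SLEEP | KM_NOFS);
        kmem_free(buf, 2 * MAX_SLAB_SIZE);
    }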
diff -Nru a/fs/xfs/linux/mrlock.c b/fs/xfs/linux/mrlock.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/mrlock.c Thu Jan 29 23:15:58 2004 @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <linux/time.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/interrupt.h> + +#include "mrlock.h" + + +#if USE_RW_WAIT_QUEUE_SPINLOCK +# define wq_write_lock write_lock +#else +# define wq_write_lock spin_lock +#endif + +/* + * We don't seem to need lock_type (only one supported), name, or + * sequence. But, XFS will pass it so let's leave them here for now. + */ +/* ARGSUSED */ +void +mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence) +{ + mrp->mr_count = 0; + mrp->mr_reads_waiting = 0; + mrp->mr_writes_waiting = 0; + init_waitqueue_head(&mrp->mr_readerq); + init_waitqueue_head(&mrp->mr_writerq); + mrp->mr_lock = SPIN_LOCK_UNLOCKED; +} + +/* + * Macros to lock/unlock the mrlock_t. + */ + +#define MRLOCK(m) spin_lock(&(m)->mr_lock); +#define MRUNLOCK(m) spin_unlock(&(m)->mr_lock); + + +/* + * lock_wait should never be called in an interrupt thread. + * + * mrlocks can sleep (i.e. call schedule) and so they can't ever + * be called from an interrupt thread. + * + * threads that wake-up should also never be invoked from interrupt threads. + * + * But, waitqueue_lock is locked from interrupt threads - and we are + * called with interrupts disabled, so it is all OK.
+ */ + +/* ARGSUSED */ +void +lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw) +{ + DECLARE_WAITQUEUE( wait, current ); + + __set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock(&q->lock); + if (rw) { + __add_wait_queue_tail(q, &wait); + } else { + __add_wait_queue(q, &wait); + } + + spin_unlock(&q->lock); + spin_unlock(lock); + + schedule(); + + spin_lock(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock(&q->lock); + + spin_lock(lock); + + /* return with lock held */ +} + +/* ARGSUSED */ +void +mrfree(mrlock_t *mrp) +{ +} + +/* ARGSUSED */ +void +mrlock(mrlock_t *mrp, int type, int flags) +{ + if (type == MR_ACCESS) + mraccess(mrp); + else + mrupdate(mrp); +} + +/* ARGSUSED */ +void +mraccessf(mrlock_t *mrp, int flags) +{ + MRLOCK(mrp); + if(mrp->mr_writes_waiting > 0) { + mrp->mr_reads_waiting++; + lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); + mrp->mr_reads_waiting--; + } + while (mrp->mr_count < 0) { + mrp->mr_reads_waiting++; + lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); + mrp->mr_reads_waiting--; + } + mrp->mr_count++; + MRUNLOCK(mrp); +} + +/* ARGSUSED */ +void +mrupdatef(mrlock_t *mrp, int flags) +{ + MRLOCK(mrp); + while(mrp->mr_count) { + mrp->mr_writes_waiting++; + lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1); + mrp->mr_writes_waiting--; + } + + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); +} + +int +mrtryaccess(mrlock_t *mrp) +{ + MRLOCK(mrp); + /* + * If anyone is waiting for update access or the lock is held for update + * fail the request. + */ + if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) { + MRUNLOCK(mrp); + return 0; + } + mrp->mr_count++; + MRUNLOCK(mrp); + return 1; +} + +int +mrtrypromote(mrlock_t *mrp) +{ + MRLOCK(mrp); + + if(mrp->mr_count == 1) { /* We are the only thread with the lock */ + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); + return 1; + } + + MRUNLOCK(mrp); + return 0; +} + +int +mrtryupdate(mrlock_t *mrp) +{ + MRLOCK(mrp); + + if(mrp->mr_count) { + MRUNLOCK(mrp); + return 0; + } + + mrp->mr_count = -1; /* writer on it */ + MRUNLOCK(mrp); + return 1; +} + +static __inline__ void mrwake(mrlock_t *mrp) +{ + /* + * First, if the count is now 0, we need to wake-up anyone waiting. + */ + if (!mrp->mr_count) { + if (mrp->mr_writes_waiting) { /* Wake-up first writer waiting */ + wake_up(&mrp->mr_writerq); + } else if (mrp->mr_reads_waiting) { /* Wakeup any readers waiting */ + wake_up(&mrp->mr_readerq); + } + } +} + +void +mraccunlock(mrlock_t *mrp) +{ + MRLOCK(mrp); + mrp->mr_count--; + mrwake(mrp); + MRUNLOCK(mrp); +} + +void +mrunlock(mrlock_t *mrp) +{ + MRLOCK(mrp); + if (mrp->mr_count < 0) { + mrp->mr_count = 0; + } else { + mrp->mr_count--; + } + mrwake(mrp); + MRUNLOCK(mrp); +} + +int +ismrlocked(mrlock_t *mrp, int type) /* No need to lock since info can change */ +{ + if (type == MR_ACCESS) + return (mrp->mr_count > 0); /* Read lock */ + else if (type == MR_UPDATE) + return (mrp->mr_count < 0); /* Write lock */ + else if (type == (MR_UPDATE | MR_ACCESS)) + return (mrp->mr_count); /* Any type of lock held */ + else /* Any waiters */ + return (mrp->mr_reads_waiting | mrp->mr_writes_waiting); +} + +/* + * Demote from update to access. We better be the only thread with the + * lock in update mode so it should be easy to set to 1. + * Wake-up any readers waiting. 
+ */ + +void +mrdemote(mrlock_t *mrp) +{ + MRLOCK(mrp); + mrp->mr_count = 1; + if (mrp->mr_reads_waiting) { /* Wakeup all readers waiting */ + wake_up(&mrp->mr_readerq); + } + MRUNLOCK(mrp); +} diff -Nru a/fs/xfs/linux/mrlock.h b/fs/xfs/linux/mrlock.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/mrlock.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include <linux/time.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/spinlock.h> + +/* + * Implement mrlocks on Linux that work for XFS. + * + * These are sleep locks and not spinlocks. If one wants read/write spinlocks, + * use read_lock, write_lock, ... see spinlock.h. + */ + +typedef struct mrlock_s { + int mr_count; + unsigned short mr_reads_waiting; + unsigned short mr_writes_waiting; + wait_queue_head_t mr_readerq; + wait_queue_head_t mr_writerq; + spinlock_t mr_lock; +} mrlock_t; + +#define MR_ACCESS 1 +#define MR_UPDATE 2 + +#define MRLOCK_BARRIER 0x1 +#define MRLOCK_ALLOW_EQUAL_PRI 0x8 + +/* + * mraccessf/mrupdatef take flags to be passed in while sleeping; + * only PLTWAIT is currently supported. + */ + +extern void mraccessf(mrlock_t *, int); +extern void mrupdatef(mrlock_t *, int); +extern void mrlock(mrlock_t *, int, int); +extern void mrunlock(mrlock_t *); +extern void mraccunlock(mrlock_t *); +extern int mrtryupdate(mrlock_t *); +extern int mrtryaccess(mrlock_t *); +extern int mrtrypromote(mrlock_t *); +extern void mrdemote(mrlock_t *); + +extern int ismrlocked(mrlock_t *, int); +extern void mrlock_init(mrlock_t *, int type, char *name, long sequence); +extern void mrfree(mrlock_t *); + +#define mrinit(mrp, name) mrlock_init(mrp, MRLOCK_BARRIER, name, -1) +#define mraccess(mrp) mraccessf(mrp, 0) /* grab for READ/ACCESS */ +#define mrupdate(mrp) mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */ +#define mrislocked_access(mrp) ((mrp)->mr_count > 0) +#define mrislocked_update(mrp) ((mrp)->mr_count < 0) + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */
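[Reviewer note: to make the mr_count convention concrete: a positive count is that many concurrent readers, -1 is a single writer. A minimal sketch of the calling pattern, with the lock variable and function invented for illustration.]

    #include "mrlock.h"

    static mrlock_t m_lock;  /* hypothetical lock */

    static void mrlock_example(void)
    {
        mrinit(&m_lock, "m_lock");

        mraccess(&m_lock);   /* shared: mr_count goes to 1 */
        /* ... more readers may pile up here concurrently ... */
        mraccunlock(&m_lock);

        mrupdate(&m_lock);   /* exclusive: mr_count goes to -1 */
        mrdemote(&m_lock);   /* writer drops back to a single reader */
        mrunlock(&m_lock);

        mrfree(&m_lock);
    }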
diff -Nru a/fs/xfs/linux/mutex.h b/fs/xfs/linux/mutex.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/mutex.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MUTEX_H__ +#define __XFS_SUPPORT_MUTEX_H__ + +#include <linux/spinlock.h> +#include <asm/semaphore.h> + +/* + * Map the mutexes from IRIX to Linux semaphores. + * + * Destroy simply initializes to -99 which should block all other + * callers. + */ +#define MUTEX_DEFAULT 0x0 +typedef struct semaphore mutex_t; + +#define mutex_init(lock, type, name) sema_init(lock, 1) +#define mutex_destroy(lock) sema_init(lock, -99) +#define mutex_lock(lock, num) down(lock) +#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1) +#define mutex_unlock(lock) up(lock) + +#endif /* __XFS_SUPPORT_MUTEX_H__ */
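[Reviewer note: since the macros simply drop the extra IRIX arguments, call sites keep their original shape. A small sketch with invented names; the second mutex_lock argument is ignored on Linux.]

    #include "mutex.h"

    static mutex_t x_mutex;  /* hypothetical */

    static void mutex_example(void)
    {
        mutex_init(&x_mutex, MUTEX_DEFAULT, "x_mutex");

        mutex_lock(&x_mutex, 0);  /* IRIX priority argument, unused */
        /* ... serialized section ... */
        mutex_unlock(&x_mutex);

        if (mutex_trylock(&x_mutex)) {  /* 1 == acquired */
            mutex_unlock(&x_mutex);
        }
        mutex_destroy(&x_mutex);
    }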
diff -Nru a/fs/xfs/linux/sema.h b/fs/xfs/linux/sema.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/sema.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SEMA_H__ +#define __XFS_SUPPORT_SEMA_H__ + +#include <linux/time.h> +#include <linux/wait.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> + +/* + * sema_t structure just maps to struct semaphore in Linux kernel. + */ + +typedef struct semaphore sema_t; + +#define init_sema(sp, val, c, d) sema_init(sp, val) +#define initsema(sp, val) sema_init(sp, val) +#define initnsema(sp, val, name) sema_init(sp, val) +#define psema(sp, b) down(sp) +#define vsema(sp) up(sp) +#define valusema(sp) (atomic_read(&(sp)->count)) +#define freesema(sema) + +/* + * Map cpsema (try to get the sema) to down_trylock. We need to switch + * the return values since cpsema returns 1 (acquired) 0 (failed) and + * down_trylock returns the reverse 0 (acquired) 1 (failed). + */ + +#define cpsema(sp) (down_trylock(sp) ? 0 : 1) + +/* + * Didn't do cvsema(sp). Not sure how to map this to up/down/... + * It does a vsema if the value is < 0, otherwise nothing. + */ + +#endif /* __XFS_SUPPORT_SEMA_H__ */
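[Reviewer note: the one behavioural trap here is cpsema()'s return value, inverted relative to down_trylock(). A short sketch with invented names.]

    #include "sema.h"

    static sema_t s_sema;  /* hypothetical */

    static void sema_example(void)
    {
        initnsema(&s_sema, 1, "s_sema");

        if (cpsema(&s_sema)) {  /* 1 == acquired, 0 == busy */
            /* ... critical section ... */
            vsema(&s_sema);
        }

        psema(&s_sema, 0);  /* blocking acquire; the flag arg is unused */
        vsema(&s_sema);
        freesema(&s_sema);
    }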
diff -Nru a/fs/xfs/linux/spin.h b/fs/xfs/linux/spin.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/spin.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SPIN_H__ +#define __XFS_SUPPORT_SPIN_H__ + +#include <linux/sched.h> /* preempt needs this */ +#include <linux/spinlock.h> + +/* + * Map lock_t from IRIX to Linux spinlocks. + * + * Note that linux turns on/off spinlocks depending on CONFIG_SMP. + * We don't need to worry about SMP or not here. + */ + +#define SPLDECL(s) unsigned long s + +typedef spinlock_t lock_t; + +#define spinlock_init(lock, name) spin_lock_init(lock) +#define spinlock_destroy(lock) + +static inline unsigned long mutex_spinlock(lock_t *lock) +{ + spin_lock(lock); + return 0; +} + +/*ARGSUSED*/ +static inline void mutex_spinunlock(lock_t *lock, unsigned long s) +{ + spin_unlock(lock); +} + +static inline void nested_spinlock(lock_t *lock) +{ + spin_lock(lock); +} + +static inline void nested_spinunlock(lock_t *lock) +{ + spin_unlock(lock); +} + +#endif /* __XFS_SUPPORT_SPIN_H__ */
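[Reviewer note: the dummy return value keeps IRIX's interrupt-level idiom compiling: callers still declare and pass an "spl" cookie even though Linux ignores it. Sketch with an invented lock name.]

    #include "spin.h"

    static lock_t x_lock;  /* hypothetical */

    static void spin_example(void)
    {
        SPLDECL(s);  /* expands to: unsigned long s */

        spinlock_init(&x_lock, "x_lock");
        s = mutex_spinlock(&x_lock);  /* s is always 0 on Linux */
        /* ... short, non-sleeping critical section ... */
        mutex_spinunlock(&x_lock, s);
        spinlock_destroy(&x_lock);
    }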
diff -Nru a/fs/xfs/linux/sv.h b/fs/xfs/linux/sv.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/sv.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SV_H__ +#define __XFS_SUPPORT_SV_H__ + +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/spinlock.h> + +/* + * Synchronisation variables. + * + * (Parameters "pri", "svf" and "rts" are not implemented) + */ + +typedef struct sv_s { + wait_queue_head_t waiters; +} sv_t; + +#define SV_FIFO 0x0 /* sv_t is FIFO type */ +#define SV_LIFO 0x2 /* sv_t is LIFO type */ +#define SV_PRIO 0x4 /* sv_t is PRIO type */ +#define SV_KEYED 0x6 /* sv_t is KEYED type */ +#define SV_DEFAULT SV_FIFO + + +static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, + unsigned long timeout) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&sv->waiters, &wait); + __set_current_state(state); + spin_unlock(lock); + + schedule_timeout(timeout); + + remove_wait_queue(&sv->waiters, &wait); +} + +#define init_sv(sv,type,name,flag) \ init_waitqueue_head(&(sv)->waiters) +#define sv_init(sv,flag,name) \ init_waitqueue_head(&(sv)->waiters) +#define sv_destroy(sv) \ /*NOTHING*/ +#define sv_wait(sv, pri, lock, s) \ _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_wait_sig(sv, pri, lock, s) \ _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_signal(sv) \ wake_up(&(sv)->waiters) +#define sv_broadcast(sv) \ wake_up_all(&(sv)->waiters) + +#endif /* __XFS_SUPPORT_SV_H__ */
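[Reviewer note: these behave like condition variables, with one caveat worth spelling out: _sv_wait() drops the caller's spinlock before sleeping and does not retake it, so the waiter must re-lock on return. A hedged sketch; the names and the f_done flag are invented, and init_sv(&f_wait, SV_DEFAULT, "f_wait", 0) is assumed to have run at setup.]

    #include "spin.h"
    #include "sv.h"

    static lock_t f_lock;  /* hypothetical */
    static sv_t f_wait;
    static int f_done;

    static void sv_wait_side(void)
    {
        spin_lock(&f_lock);
        while (!f_done) {
            /* enter locked; sv_wait unlocks f_lock and sleeps */
            sv_wait(&f_wait, 0, &f_lock, 0);
            spin_lock(&f_lock);  /* sv_wait returns with the lock dropped */
        }
        spin_unlock(&f_lock);
    }

    static void sv_signal_side(void)
    {
        spin_lock(&f_lock);
        f_done = 1;
        spin_unlock(&f_lock);
        sv_broadcast(&f_wait);  /* wake every exclusive waiter */
    }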
diff -Nru a/fs/xfs/linux/time.h b/fs/xfs/linux/time.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/time.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include <linux/sched.h> +#include <linux/time.h> + +typedef struct timespec timespec_t; + +static inline void delay(long ticks) +{ + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + *tvp = CURRENT_TIME; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff -Nru a/fs/xfs/linux/xfs_behavior.c b/fs/xfs/linux/xfs_behavior.c --- a/fs/xfs/linux/xfs_behavior.c Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - * - */ -#include "xfs.h" - -/* - * Source file used to associate/disassociate behaviors with virtualized - * objects. See xfs_behavior.h for more information about behaviors, etc. - * - * The implementation is split between functions in this file and macros - * in xfs_behavior.h. - */ - -/* - * Insert a new behavior descriptor into a behavior chain. - * - * The behavior chain is ordered based on the 'position' number which - * lives in the first field of the ops vector (higher numbers first). - * - * Attemps to insert duplicate ops result in an EINVAL return code. - * Otherwise, return 0 to indicate success. - */ -int -bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - int position; - - /* - * Validate the position value of the new behavior. - */ - position = BHV_POSITION(bdp); - ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP); - - /* - * Find location to insert behavior. Check for duplicates. - */ - prev = NULL; - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - /* Check for duplication.
*/ - if (curdesc->bd_ops == bdp->bd_ops) { - ASSERT(0); - return EINVAL; - } - - /* Find correct position */ - if (position >= BHV_POSITION(curdesc)) { - ASSERT(position != BHV_POSITION(curdesc)); - break; /* found it */ - } - - prev = curdesc; - } - - if (prev == NULL) { - /* insert at front of chain */ - bdp->bd_next = bhp->bh_first; - bhp->bh_first = bdp; - } else { - /* insert after prev */ - bdp->bd_next = prev->bd_next; - prev->bd_next = bdp; - } - - return 0; -} - -/* - * Remove a behavior descriptor from a position in a behavior chain; - * the postition is guaranteed not to be the first position. - * Should only be called by the bhv_remove() macro. - */ -void -bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - - ASSERT(bhp->bh_first != NULL); - ASSERT(bhp->bh_first->bd_next != NULL); - - prev = bhp->bh_first; - for (curdesc = bhp->bh_first->bd_next; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc == bdp) - break; /* found it */ - prev = curdesc; - } - - ASSERT(curdesc == bdp); - prev->bd_next = bdp->bd_next; /* remove from after prev */ -} - -/* - * Look for a specific ops vector on the specified behavior chain. - * Return the associated behavior descriptor. Or NULL, if not found. - */ -bhv_desc_t * -bhv_lookup(bhv_head_t *bhp, void *ops) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_ops == ops) - return curdesc; - } - - return NULL; -} - -/* - * Looks for the first behavior within a specified range of positions. - * Return the associated behavior descriptor. Or NULL, if none found. - */ -bhv_desc_t * -bhv_lookup_range(bhv_head_t *bhp, int low, int high) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - int position = BHV_POSITION(curdesc); - - if (position <= high) { - if (position >= low) - return curdesc; - return NULL; - } - } - - return NULL; -} - -/* - * Return the base behavior in the chain, or NULL if the chain - * is empty. - * - * The caller has not read locked the behavior chain, so acquire the - * lock before traversing the chain. - */ -bhv_desc_t * -bhv_base(bhv_head_t *bhp) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_next == NULL) { - return curdesc; - } - } - - return NULL; -} - -void -bhv_head_init( - bhv_head_t *bhp, - char *name) -{ - bhp->bh_first = NULL; -} - -void -bhv_insert_initial( - bhv_head_t *bhp, - bhv_desc_t *bdp) -{ - ASSERT(bhp->bh_first == NULL); - (bhp)->bh_first = bdp; -} - -void -bhv_head_destroy( - bhv_head_t *bhp) -{ - ASSERT(bhp->bh_first == NULL); -} diff -Nru a/fs/xfs/linux/xfs_behavior.h b/fs/xfs/linux/xfs_behavior.h --- a/fs/xfs/linux/xfs_behavior.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_BEHAVIOR_H__ -#define __XFS_BEHAVIOR_H__ - -/* - * Header file used to associate behaviors with virtualized objects. - * - * A virtualized object is an internal, virtualized representation of - * OS entities such as persistent files, processes, or sockets. Examples - * of virtualized objects include vnodes, vprocs, and vsockets. Often - * a virtualized object is referred to simply as an "object." - * - * A behavior is essentially an implementation layer associated with - * an object. Multiple behaviors for an object are chained together, - * the order of chaining determining the order of invocation. Each - * behavior of a given object implements the same set of interfaces - * (e.g., the VOP interfaces). - * - * Behaviors may be dynamically inserted into an object's behavior chain, - * such that the addition is transparent to consumers that already have - * references to the object. Typically, a given behavior will be inserted - * at a particular location in the behavior chain. Insertion of new - * behaviors is synchronized with operations-in-progress (oip's) so that - * the oip's always see a consistent view of the chain. - * - * The term "interpostion" is used to refer to the act of inserting - * a behavior such that it interposes on (i.e., is inserted in front - * of) a particular other behavior. A key example of this is when a - * system implementing distributed single system image wishes to - * interpose a distribution layer (providing distributed coherency) - * in front of an object that is otherwise only accessed locally. - * - * Note that the traditional vnode/inode combination is simply a virtualized - * object that has exactly one associated behavior. - * - * Behavior synchronization is logic which is necessary under certain - * circumstances that there is no conflict between ongoing operations - * traversing the behavior chain and those dunamically modifying the - * behavior chain. Because behavior synchronization adds extra overhead - * to virtual operation invocation, we want to restrict, as much as - * we can, the requirement for this extra code, to those situations - * in which it is truly necessary. - * - * Behavior synchronization is needed whenever there's at least one class - * of object in the system for which: - * 1) multiple behaviors for a given object are supported, - * -- AND -- - * 2a) insertion of a new behavior can happen dynamically at any time during - * the life of an active object, - * -- AND -- - * 3a) insertion of a new behavior needs to synchronize with existing - * ops-in-progress. 
- * -- OR -- - * 3b) multiple different behaviors can be dynamically inserted at - * any time during the life of an active object - * -- OR -- - * 3c) removal of a behavior can occur at any time during the life of - * an active object. - * -- OR -- - * 2b) removal of a behavior can occur at any time during the life of an - * active object - * - */ - -struct bhv_head_lock; - -/* - * Behavior head. Head of the chain of behaviors. - * Contained within each virtualized object data structure. - */ -typedef struct bhv_head { - struct bhv_desc *bh_first; /* first behavior in chain */ - struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */ -} bhv_head_t; - -/* - * Behavior descriptor. Descriptor associated with each behavior. - * Contained within the behavior's private data structure. - */ -typedef struct bhv_desc { - void *bd_pdata; /* private data for this behavior */ - void *bd_vobj; /* virtual object associated with */ - void *bd_ops; /* ops for this behavior */ - struct bhv_desc *bd_next; /* next behavior in chain */ -} bhv_desc_t; - -/* - * Behavior identity field. A behavior's identity determines the position - * where it lives within a behavior chain, and it's always the first field - * of the behavior's ops vector. The optional id field further identifies the - * subsystem responsible for the behavior. - */ -typedef struct bhv_identity { - __u16 bi_id; /* owning subsystem id */ - __u16 bi_position; /* position in chain */ -} bhv_identity_t; - -typedef bhv_identity_t bhv_position_t; - -#define BHV_IDENTITY_INIT(id,pos) {id, pos} -#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) - -/* - * Define boundaries of position values. - */ -#define BHV_POSITION_INVALID 0 /* invalid position number */ -#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ -#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ - -/* - * Plumbing macros. - */ -#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) -#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) -#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) -#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) -#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) -#define BHV_PDATA(bdp) (bdp)->bd_pdata -#define BHV_OPS(bdp) (bdp)->bd_ops -#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) -#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) - -extern void bhv_head_init(bhv_head_t *, char *); -extern void bhv_head_destroy(bhv_head_t *); -extern int bhv_insert(bhv_head_t *, bhv_desc_t *); -extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); - -/* - * Initialize a new behavior descriptor. - * Arguments: - * bdp - pointer to behavior descriptor - * pdata - pointer to behavior's private data - * vobj - pointer to associated virtual object - * ops - pointer to ops for this behavior - */ -#define bhv_desc_init(bdp, pdata, vobj, ops) \ - { \ - (bdp)->bd_pdata = pdata; \ - (bdp)->bd_vobj = vobj; \ - (bdp)->bd_ops = ops; \ - (bdp)->bd_next = NULL; \ - } - -/* - * Remove a behavior descriptor from a behavior chain. - */ -#define bhv_remove(bhp, bdp) \ - { \ - if ((bhp)->bh_first == (bdp)) { \ - /* \ - * Remove from front of chain. \ - * Atomic wrt oip's. \ - */ \ - (bhp)->bh_first = (bdp)->bd_next; \ - } else { \ - /* remove from non-front of chain */ \ - bhv_remove_not_first(bhp, bdp); \ - } \ - (bdp)->bd_vobj = NULL; \ - } - -/* - * Behavior module prototypes. - */ -extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); -extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); -extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); -extern bhv_desc_t * bhv_base(bhv_head_t *bhp); - -/* No bhv locking on Linux */ -#define bhv_lookup_unlocked bhv_lookup -#define bhv_base_unlocked bhv_base - -#endif /* __XFS_BEHAVIOR_H__ */
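[Reviewer note: the behavior files are not going away; per the Makefile hunk above they move up into fs/xfs/ proper. For orientation, this is roughly how a chain is assembled around a virtualized object. A hedged sketch only: the foo_ops vector, object and private pointers are all invented.]

    #include "xfs_behavior.h"

    /* hypothetical ops vector; the identity must be its first field */
    typedef struct foo_ops {
        bhv_position_t fo_position;
        int (*fo_read)(bhv_desc_t *);
    } foo_ops_t;

    static foo_ops_t base_ops = { BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE) };
    static foo_ops_t top_ops = { BHV_IDENTITY_INIT_POSITION(BHV_POSITION_TOP) };

    static void behavior_example(void *vobj, void *base_pdata, void *top_pdata)
    {
        bhv_head_t head;
        bhv_desc_t base_bdp, top_bdp;

        bhv_head_init(&head, "foo");

        bhv_desc_init(&base_bdp, base_pdata, vobj, &base_ops);
        bhv_insert_initial(&head, &base_bdp);

        /* interpose: the higher position number sorts to the front */
        bhv_desc_init(&top_bdp, top_pdata, vobj, &top_ops);
        bhv_insert(&head, &top_bdp);

        /* ops now dispatch starting at BHV_HEAD_FIRST(&head), i.e. &top_bdp */
    }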
diff -Nru a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/xfs_buf.c Thu Jan 29 23:15:58 2004 @@ -0,0 +1,2058 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * page_buf.c + * + * The page_buf module provides an abstract buffer cache model on top of + * the Linux page cache. Cached metadata blocks for a file system are + * hashed to the inode for the block device. The page_buf module + * assembles buffer (page_buf_t) objects on demand to aggregate such + * cached pages for I/O. + * + * + * Written by Steve Lord, Jim Mostek, Russell Cattelan + * and Rajagopal Ananthanarayanan ("ananth") at SGI. + * + */ + +#include <linux/stddef.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/suspend.h> +#include <linux/percpu.h> + +#include <support/ktrace.h> +#include <support/debug.h> +#include "kmem.h" + +#include "xfs_types.h" +#include "xfs_cred.h" +#include "xfs_lrw.h" +#include "xfs_buf.h" + +#define BBSHIFT 9 +#define BN_ALIGN_MASK ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1) + +#ifndef GFP_READAHEAD +#define GFP_READAHEAD (__GFP_NOWARN|__GFP_NORETRY) +#endif + +/* + * File wide globals + */ + +STATIC kmem_cache_t *pagebuf_cache; +STATIC void pagebuf_daemon_wakeup(void); +STATIC void pagebuf_delwri_queue(page_buf_t *, int); +STATIC struct workqueue_struct *pagebuf_logio_workqueue; +STATIC struct workqueue_struct *pagebuf_dataio_workqueue; + +/* + * Pagebuf module configuration parameters, exported via + * /proc/sys/vm/pagebuf + */ + +typedef struct pb_sysctl_val { + int min; + int val; + int max; +} pb_sysctl_val_t; + +struct { + pb_sysctl_val_t flush_interval; /* interval between runs of the + * delwri flush daemon. */ + pb_sysctl_val_t age_buffer; /* time for buffer to age before + * we flush it.
 */ + pb_sysctl_val_t stats_clear; /* clear the pagebuf stats */ + pb_sysctl_val_t debug; /* debug tracing on or off */ +} pb_params = { + /* MIN DFLT MAX */ + .flush_interval = { HZ/2, HZ, 30*HZ }, + .age_buffer = { 1*HZ, 15*HZ, 300*HZ }, + .stats_clear = { 0, 0, 1 }, + .debug = { 0, 0, 1 }, +}; + +enum { + PB_FLUSH_INT = 1, + PB_FLUSH_AGE = 2, + PB_STATS_CLEAR = 3, + PB_DEBUG = 4, +}; + +/* + * Pagebuf statistics variables + */ + +struct pbstats { + u_int32_t pb_get; + u_int32_t pb_create; + u_int32_t pb_get_locked; + u_int32_t pb_get_locked_waited; + u_int32_t pb_busy_locked; + u_int32_t pb_miss_locked; + u_int32_t pb_page_retries; + u_int32_t pb_page_found; + u_int32_t pb_get_read; +} pbstats; +DEFINE_PER_CPU(struct pbstats, pbstats); + +/* We don't disable preempt, not too worried about poking the + * wrong cpu's stat for now */ +#define PB_STATS_INC(count) (__get_cpu_var(pbstats).count++) + +/* + * Pagebuf debugging + */ + +#ifdef PAGEBUF_TRACE +void +pagebuf_trace( + page_buf_t *pb, + char *id, + void *data, + void *ra) +{ + if (!pb_params.debug.val) + return; + ktrace_enter(pagebuf_trace_buf, + pb, id, + (void *)(unsigned long)pb->pb_flags, + (void *)(unsigned long)pb->pb_hold.counter, + (void *)(unsigned long)pb->pb_sema.count.counter, + (void *)current, + data, ra, + (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), + (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), + (void *)(unsigned long)pb->pb_buffer_length, + NULL, NULL, NULL, NULL, NULL); +} +ktrace_t *pagebuf_trace_buf; +#define PAGEBUF_TRACE_SIZE 4096 +#define PB_TRACE(pb, id, data) \ + pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) +#else +#define PB_TRACE(pb, id, data) do { } while (0) +#endif + +#ifdef PAGEBUF_LOCK_TRACKING +# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) +# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) +# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) +#else +# define PB_SET_OWNER(pb) do { } while (0) +# define PB_CLEAR_OWNER(pb) do { } while (0) +# define PB_GET_OWNER(pb) do { } while (0) +#endif + +/* + * Pagebuf allocation / freeing. + */ + +#define pb_to_gfp(flags) \ + (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \ + ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) + +#define pb_to_km(flags) \ + (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + + +#define pagebuf_allocate(flags) \ + kmem_zone_alloc(pagebuf_cache, pb_to_km(flags)) +#define pagebuf_deallocate(pb) \ + kmem_zone_free(pagebuf_cache, (pb)); + +/* + * Pagebuf hashing + */ + +#define NBITS 8 +#define NHASH (1<<NBITS) + +typedef struct { + struct list_head pb_hash; + spinlock_t pb_hash_lock; +} pb_hash_t; + +STATIC pb_hash_t pbhash[NHASH]; +#define pb_hash(pb) &pbhash[pb->pb_hash_index] + +STATIC int +_bhash( + struct block_device *bdev, + loff_t base) +{ + int bit, hval; + + base >>= 9; + base ^= (unsigned long)bdev / L1_CACHE_BYTES; + for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) { + hval ^= (int)base & (NHASH-1); + base >>= NBITS; + } + return hval; +}
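[Reviewer note: to see what the fold does, here is a standalone user-space rendition of the same hashing. Not code from the patch: the device "address" and the L1_CACHE_BYTES value are made up so it can run anywhere.]

    #include <stdio.h>

    #define NBITS 8
    #define NHASH (1 << NBITS)
    #define L1_CACHE_BYTES 64  /* assumed cacheline size */

    /* mirrors _bhash(): fold the 512-byte block number, seeded with
     * the block_device pointer, NBITS at a time into one bucket index */
    static int bhash(unsigned long bdev, long long base)
    {
        int bit, hval;

        base >>= 9;
        base ^= bdev / L1_CACHE_BYTES;
        for (bit = hval = 0; base && bit < (int)(sizeof(base) * 8); bit += NBITS) {
            hval ^= (int)base & (NHASH - 1);
            base >>= NBITS;
        }
        return hval;
    }

    int main(void)
    {
        unsigned long dev = 0xc0ffee00UL;  /* stand-in for a bdev pointer */

        printf("block 8192 -> bucket %d\n", bhash(dev, 8192LL << 9));
        printf("block 8193 -> bucket %d\n", bhash(dev, 8193LL << 9));
        return 0;
    }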
+ +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +STATIC void *pagebuf_mapout_locked(page_buf_t *); + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +STATIC a_list_t *as_free_head; +STATIC int as_list_len; +STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED; + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); + if (aentry) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + +/* + * Internal pagebuf object manipulation + */ + +STATIC void +_pagebuf_initialize( + page_buf_t *pb, + pb_target_t *target, + loff_t range_base, + size_t range_length, + page_buf_flags_t flags) +{ + /* + * We don't want certain flags to appear in pb->pb_flags. + */ + flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); + + memset(pb, 0, sizeof(page_buf_t)); + atomic_set(&pb->pb_hold, 1); + init_MUTEX_LOCKED(&pb->pb_iodonesema); + INIT_LIST_HEAD(&pb->pb_list); + INIT_LIST_HEAD(&pb->pb_hash_list); + init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ + PB_SET_OWNER(pb); + pb->pb_target = target; + pb->pb_file_offset = range_base; + /* + * Set buffer_length and count_desired to the same value initially. + * IO routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + pb->pb_buffer_length = pb->pb_count_desired = range_length; + pb->pb_flags = flags | PBF_NONE; + pb->pb_bn = PAGE_BUF_DADDR_NULL; + atomic_set(&pb->pb_pin_count, 0); + init_waitqueue_head(&pb->pb_waiters); + + PB_STATS_INC(pb_create); + PB_TRACE(pb, "initialize", target); +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_pagebuf_get_pages( + page_buf_t *pb, + int page_count, + page_buf_flags_t flags) +{ + int gpf_mask = pb_to_gfp(flags); + + /* Make sure that we have a page list */ + if (pb->pb_pages == NULL) { + pb->pb_offset = page_buf_poff(pb->pb_file_offset); + pb->pb_page_count = page_count; + if (page_count <= PB_PAGES) { + pb->pb_pages = pb->pb_page_array; + } else { + pb->pb_pages = kmalloc(sizeof(struct page *) * + page_count, gpf_mask); + if (pb->pb_pages == NULL) + return -ENOMEM; + } + memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Walk a pagebuf releasing all the pages contained within it. + */ +STATIC inline void +_pagebuf_freepages( + page_buf_t *pb) +{ + int buf_index; + + for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) { + struct page *page = pb->pb_pages[buf_index]; + + if (page) { + pb->pb_pages[buf_index] = NULL; + page_cache_release(page); + } + } +} + +/* + * pagebuf_free + * + * pagebuf_free releases the specified buffer. The modification + * state of any associated pages is left unchanged. + */ +void +pagebuf_free( + page_buf_t *pb) +{ + PB_TRACE(pb, "free", 0); + + if (pb->pb_flags & _PBF_LOCKABLE) { + pb_hash_t *hash = pb_hash(pb); + + spin_lock(&hash->pb_hash_lock); + /* + * Someone grabbed a reference while we weren't looking, + * try again later.
+ */ + if (unlikely(atomic_read(&pb->pb_hold))) { + spin_unlock(&hash->pb_hash_lock); + return; + } else if (!list_empty(&pb->pb_hash_list)) + list_del_init(&pb->pb_hash_list); + spin_unlock(&hash->pb_hash_lock); + } + + /* release any virtual mapping */ ; + if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { + void *vaddr = pagebuf_mapout_locked(pb); + if (vaddr) { + free_address(vaddr); + } + } + + if (pb->pb_flags & _PBF_MEM_ALLOCATED) { + if (pb->pb_pages) { + /* release the pages in the address list */ + if ((pb->pb_pages[0]) && + (pb->pb_flags & _PBF_MEM_SLAB)) { + kfree(pb->pb_addr); + } else { + _pagebuf_freepages(pb); + } + if (pb->pb_pages != pb->pb_page_array) + kfree(pb->pb_pages); + pb->pb_pages = NULL; + } + pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB); + } + + pagebuf_deallocate(pb); +} + +/* + * _pagebuf_lookup_pages + * + * _pagebuf_lookup_pages finds all pages which match the buffer + * in question and the range of file offsets supplied, + * and builds the page list for the buffer, if the + * page list is not already formed or if not all of the pages are + * already in the list. Invalid pages (pages which have not yet been + * read in from disk) are assigned for any pages which are not found. + */ +STATIC int +_pagebuf_lookup_pages( + page_buf_t *pb, + struct address_space *aspace, + page_buf_flags_t flags) +{ + loff_t next_buffer_offset; + unsigned long page_count, pi, index; + struct page *page; + int gfp_mask, retry_count = 5, rval = 0; + int all_mapped, good_pages, nbytes; + unsigned int blocksize, sectorshift; + size_t size, offset; + + + /* For pagebufs where we want to map an address, do not use + * highmem pages - so that we do not need to use kmap resources + * to access the data. + * + * For pages where the caller has indicated there may be resource + * contention (e.g. called from a transaction) do not flush + * delalloc pages to obtain memory. 
+ */ + + if (flags & PBF_READ_AHEAD) { + gfp_mask = GFP_READAHEAD; + retry_count = 0; + } else if (flags & PBF_DONT_BLOCK) { + gfp_mask = GFP_NOFS; + } else if (flags & PBF_MAPPABLE) { + gfp_mask = GFP_KERNEL; + } else { + gfp_mask = GFP_HIGHUSER; + } + + next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length; + + good_pages = page_count = (page_buf_btoc(next_buffer_offset) - + page_buf_btoct(pb->pb_file_offset)); + + if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) { + /* Bring pages forward in cache */ + for (pi = 0; pi < page_count; pi++) { + mark_page_accessed(pb->pb_pages[pi]); + } + if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) { + all_mapped = 1; + goto mapit; + } + return 0; + } + + /* Ensure pb_pages field has been initialised */ + rval = _pagebuf_get_pages(pb, page_count, flags); + if (rval) + return rval; + + rval = pi = 0; + blocksize = pb->pb_target->pbr_bsize; + sectorshift = pb->pb_target->pbr_sshift; + size = pb->pb_count_desired; + offset = pb->pb_offset; + + /* Enter the pages in the page list */ + index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT; + for (all_mapped = 1; pi < page_count; pi++, index++) { + if (pb->pb_pages[pi] == 0) { + retry: + page = find_or_create_page(aspace, index, gfp_mask); + if (!page) { + if (--retry_count > 0) { + PB_STATS_INC(pb_page_retries); + pagebuf_daemon_wakeup(); + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(10); + goto retry; + } + rval = -ENOMEM; + all_mapped = 0; + continue; + } + PB_STATS_INC(pb_page_found); + mark_page_accessed(page); + pb->pb_pages[pi] = page; + } else { + page = pb->pb_pages[pi]; + lock_page(page); + } + + nbytes = PAGE_CACHE_SIZE - offset; + if (nbytes > size) + nbytes = size; + size -= nbytes; + + if (!PageUptodate(page)) { + if (blocksize == PAGE_CACHE_SIZE) { + if (flags & PBF_READ) + pb->pb_locked = 1; + good_pages--; + } else if (!PagePrivate(page)) { + unsigned long i, range; + + /* + * In this case page->private holds a bitmap + * of uptodate sectors within the page + */ + ASSERT(blocksize < PAGE_CACHE_SIZE); + range = (offset + nbytes) >> sectorshift; + for (i = offset >> sectorshift; i < range; i++) + if (!test_bit(i, &page->private)) + break; + if (i != range) + good_pages--; + } else { + good_pages--; + } + } + offset = 0; + } + + if (!pb->pb_locked) { + for (pi = 0; pi < page_count; pi++) { + if (pb->pb_pages[pi]) + unlock_page(pb->pb_pages[pi]); + } + } + +mapit: + pb->pb_flags |= _PBF_MEM_ALLOCATED; + if (all_mapped) { + pb->pb_flags |= _PBF_ALL_PAGES_MAPPED; + + /* A single page buffer is always mappable */ + if (page_count == 1) { + pb->pb_addr = (caddr_t) + page_address(pb->pb_pages[0]) + pb->pb_offset; + pb->pb_flags |= PBF_MAPPED; + } else if (flags & PBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); + pb->pb_addr = vmap(pb->pb_pages, page_count, + VM_MAP, PAGE_KERNEL); + if (pb->pb_addr == NULL) + return -ENOMEM; + pb->pb_addr += pb->pb_offset; + pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED; + } + } + /* If some pages were found with data in them + * we are not in PBF_NONE state. + */ + if (good_pages != 0) { + pb->pb_flags &= ~(PBF_NONE); + if (good_pages != page_count) { + pb->pb_flags |= PBF_PARTIAL; + } + } + + PB_TRACE(pb, "lookup_pages", (long)good_pages); + + return rval; +} + +/* + * Finding and Reading Buffers + */ + +/* + * _pagebuf_find + * + * Looks up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. 
If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. + */ +STATIC page_buf_t * +_pagebuf_find( /* find buffer for block */ + pb_target_t *target,/* target for block */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags, /* PBF_TRYLOCK */ + page_buf_t *new_pb)/* newly allocated buffer */ +{ + loff_t range_base; + size_t range_length; + int hval; + pb_hash_t *h; + struct list_head *p; + page_buf_t *pb; + int not_locked; + + range_base = (ioff << BBSHIFT); + range_length = (isize << BBSHIFT); + + /* Ensure we never do IOs smaller than the sector size */ + BUG_ON(range_length < (1 << target->pbr_sshift)); + + /* Ensure we never do IOs that are not sector aligned */ + BUG_ON(range_base & (loff_t)target->pbr_smask); + + hval = _bhash(target->pbr_bdev, range_base); + h = &pbhash[hval]; + + spin_lock(&h->pb_hash_lock); + list_for_each(p, &h->pb_hash) { + pb = list_entry(p, page_buf_t, pb_hash_list); + + if (pb->pb_target == target && + pb->pb_file_offset == range_base && + pb->pb_buffer_length == range_length && + atomic_read(&pb->pb_hold)) { + /* If we look at something bring it to the + * front of the list for next time + */ + atomic_inc(&pb->pb_hold); + list_move(&pb->pb_hash_list, &h->pb_hash); + goto found; + } + } + + /* No match found */ + if (new_pb) { + _pagebuf_initialize(new_pb, target, range_base, + range_length, flags | _PBF_LOCKABLE); + new_pb->pb_hash_index = hval; + list_add(&new_pb->pb_hash_list, &h->pb_hash); + } else { + PB_STATS_INC(pb_miss_locked); + } + + spin_unlock(&h->pb_hash_lock); + return (new_pb); + +found: + spin_unlock(&h->pb_hash_lock); + + /* Attempt to get the semaphore without sleeping, + * if this does not work then we need to drop the + * spinlock and do a hard attempt on the semaphore. + */ + not_locked = down_trylock(&pb->pb_sema); + if (not_locked) { + if (!(flags & PBF_TRYLOCK)) { + /* wait for buffer ownership */ + PB_TRACE(pb, "get_lock", 0); + pagebuf_lock(pb); + PB_STATS_INC(pb_get_locked_waited); + } else { + /* We asked for a trylock and failed, no need + * to look at file offset and length here, we + * know that this pagebuf at least overlaps our + * pagebuf and is locked, therefore our buffer + * either does not exist, or is this buffer + */ + + pagebuf_rele(pb); + PB_STATS_INC(pb_busy_locked); + return (NULL); + } + } else { + /* trylock worked */ + PB_SET_OWNER(pb); + } + + if (pb->pb_flags & PBF_STALE) + pb->pb_flags &= PBF_MAPPABLE | \ + PBF_MAPPED | \ + _PBF_LOCKABLE | \ + _PBF_ALL_PAGES_MAPPED | \ + _PBF_ADDR_ALLOCATED | \ + _PBF_MEM_ALLOCATED | \ + _PBF_MEM_SLAB; + PB_TRACE(pb, "got_lock", 0); + PB_STATS_INC(pb_get_locked); + return (pb); +} + + +/* + * pagebuf_find + * + * pagebuf_find returns a buffer matching the specified range of + * data for the specified target, if any of the relevant blocks + * are in memory. The buffer may have unallocated holes, if + * some, but not all, of the blocks are in memory. Even where + * pages are present in the buffer, not all of every page may be + * valid. 
+ */
+page_buf_t *
+pagebuf_find(			/* find buffer for block	*/
+				/* if the block is in memory	*/
+	pb_target_t		*target,/* target for block	*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	return _pagebuf_find(target, ioff, isize, flags, NULL);
+}
+
+/*
+ * pagebuf_get
+ *
+ * pagebuf_get assembles a buffer covering the specified range.
+ * Some or all of the blocks in the range may be valid. Storage
+ * in memory for all portions of the buffer will be allocated,
+ * although backing storage may not be. If PBF_READ is set in
+ * flags, pagebuf_iostart is also called.
+ */
+page_buf_t *
+pagebuf_get(			/* allocate a buffer		*/
+	pb_target_t		*target,/* target for buffer	*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	page_buf_t		*pb, *new_pb;
+	int			error;
+
+	new_pb = pagebuf_allocate(flags);
+	if (unlikely(!new_pb))
+		return (NULL);
+
+	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+	if (pb != new_pb) {
+		pagebuf_deallocate(new_pb);
+		if (unlikely(!pb))
+			return (NULL);
+	}
+
+	PB_STATS_INC(pb_get);
+
+	/* fill in any missing pages */
+	error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+	if (unlikely(error)) {
+		pagebuf_free(pb);
+		return (NULL);
+	}
+
+	/*
+	 * Always fill in the block number now, the mapped cases can do
+	 * their own overlay of this later.
+	 */
+	pb->pb_bn = ioff;
+	pb->pb_count_desired = pb->pb_buffer_length;
+
+	if (flags & PBF_READ) {
+		if (PBF_NOT_DONE(pb)) {
+			PB_TRACE(pb, "get_read", (unsigned long)flags);
+			PB_STATS_INC(pb_get_read);
+			pagebuf_iostart(pb, flags);
+		} else if (flags & PBF_ASYNC) {
+			PB_TRACE(pb, "get_read_async", (unsigned long)flags);
+			/*
+			 * Read-ahead request already satisfied,
+			 * drop the buffer
+			 */
+			if (flags & (PBF_LOCK | PBF_TRYLOCK))
+				pagebuf_unlock(pb);
+			pagebuf_rele(pb);
+			return NULL;
+		} else {
+			PB_TRACE(pb, "get_read_done", (unsigned long)flags);
+			/* We do not want read in the flags */
+			pb->pb_flags &= ~PBF_READ;
+		}
+	} else {
+		PB_TRACE(pb, "get_write", (unsigned long)flags);
+	}
+	return (pb);
+}
+
+/*
+ * Create a skeletal pagebuf (no pages associated with it).
+ */
+page_buf_t *
+pagebuf_lookup(
+	struct pb_target	*target,
+	loff_t			ioff,
+	size_t			isize,
+	page_buf_flags_t	flags)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(flags);
+	if (pb) {
+		_pagebuf_initialize(pb, target, ioff, isize, flags);
+	}
+	return pb;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a
+ * deadlock-safe manner.
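+ *
+ * An illustrative call, mirroring the xfs_baread() macro in xfs_buf.h
+ * ("blkno" and "len" are assumed to describe the range just read, so
+ * this prefetches the blocks that follow it):
+ *
+ *	pagebuf_readahead(target, blkno + len, len, PBF_DONT_BLOCK);
+ *
+ * The request is dropped silently if the device queues are congested.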
+ */
+void
+pagebuf_readahead(
+	pb_target_t		*target,
+	loff_t			ioff,
+	size_t			isize,
+	page_buf_flags_t	flags)
+{
+	struct backing_dev_info *bdi;
+
+	bdi = target->pbr_mapping->backing_dev_info;
+	if (bdi_read_congested(bdi))
+		return;
+	if (bdi_write_congested(bdi))
+		return;
+
+	flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+	pagebuf_get(target, ioff, isize, flags);
+}
+
+page_buf_t *
+pagebuf_get_empty(
+	size_t			len,
+	pb_target_t		*target)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(_PBF_LOCKABLE);
+	if (pb)
+		_pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE);
+	return pb;
+}
+
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if (((unsigned long)addr < VMALLOC_START) ||
+	    ((unsigned long)addr >= VMALLOC_END)) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+int
+pagebuf_associate_memory(
+	page_buf_t		*pb,
+	void			*mem,
+	size_t			len)
+{
+	int			rval;
+	int			i = 0;
+	size_t			ptr;
+	size_t			end, end_cur;
+	off_t			offset;
+	int			page_count;
+
+	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+	if (offset && (len > PAGE_CACHE_SIZE))
+		page_count++;
+
+	/* Free any previous set of page pointers */
+	if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+		kfree(pb->pb_pages);
+	}
+	pb->pb_pages = NULL;
+	pb->pb_addr = mem;
+
+	rval = _pagebuf_get_pages(pb, page_count, 0);
+	if (rval)
+		return rval;
+
+	pb->pb_offset = offset;
+	ptr = (size_t) mem & PAGE_CACHE_MASK;
+	end = PAGE_CACHE_ALIGN((size_t) mem + len);
+	end_cur = end;
+	/* set up first page */
+	pb->pb_pages[0] = mem_to_page(mem);
+
+	ptr += PAGE_CACHE_SIZE;
+	pb->pb_page_count = ++i;
+	while (ptr < end) {
+		pb->pb_pages[i] = mem_to_page((void *)ptr);
+		pb->pb_page_count = ++i;
+		ptr += PAGE_CACHE_SIZE;
+	}
+	pb->pb_locked = 0;
+
+	pb->pb_count_desired = pb->pb_buffer_length = len;
+	pb->pb_flags |= PBF_MAPPED;
+
+	return 0;
+}
+
+page_buf_t *
+pagebuf_get_no_daddr(
+	size_t			len,
+	pb_target_t		*target)
+{
+	int			rval;
+	void			*rmem = NULL;
+	page_buf_flags_t	flags = _PBF_LOCKABLE | PBF_FORCEIO;
+	page_buf_t		*pb;
+	size_t			tlen = 0;
+
+	if (unlikely(len > 0x20000))
+		return NULL;
+
+	pb = pagebuf_allocate(flags);
+	if (!pb)
+		return NULL;
+
+	_pagebuf_initialize(pb, target, 0, len, flags);
+
+	do {
+		if (tlen == 0) {
+			tlen = len;	/* first time */
+		} else {
+			kfree(rmem);	/* free the mem from the previous try */
+			tlen <<= 1;	/* double the size and try again */
+		}
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == NULL) {
+			pagebuf_free(pb);
+			return NULL;
+		}
+	} while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask));
+
+	if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+		kfree(rmem);
+		pagebuf_free(pb);
+		return NULL;
+	}
+	/* otherwise pagebuf_free just ignores it */
+	pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB);
+	PB_CLEAR_OWNER(pb);
+	up(&pb->pb_sema);	/* Return unlocked pagebuf */
+
+	PB_TRACE(pb, "no_daddr", rmem);
+
+	return pb;
+}
+
+
+/*
+ * pagebuf_hold
+ *
+ * Increment reference count on buffer, to hold the buffer concurrently
+ * with another thread which may release (free) the buffer asynchronously.
+ *
+ * Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+	page_buf_t		*pb)
+{
+	atomic_inc(&pb->pb_hold);
+	PB_TRACE(pb, "hold", 0);
+}
+
+/*
+ * pagebuf_rele
+ *
+ * pagebuf_rele releases a hold on the specified buffer. If the
+ * hold count is 1, pagebuf_rele calls pagebuf_free.
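+ *
+ * Holds and releases must balance; an illustrative pairing (sketch
+ * only, "another context" stands for any asynchronous user):
+ *
+ *	pagebuf_hold(pb);	... take an extra reference ...
+ *	... hand pb to another context ...
+ *	pagebuf_rele(pb);	... drop the reference when done ...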
+ */
+void
+pagebuf_rele(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, "rele", pb->pb_relse);
+
+	if (atomic_dec_and_test(&pb->pb_hold)) {
+		int		do_free = 1;
+
+		if (pb->pb_relse) {
+			atomic_inc(&pb->pb_hold);
+			(*(pb->pb_relse)) (pb);
+			do_free = 0;
+		}
+		if (pb->pb_flags & PBF_DELWRI) {
+			pb->pb_flags |= PBF_ASYNC;
+			atomic_inc(&pb->pb_hold);
+			pagebuf_delwri_queue(pb, 0);
+			do_free = 0;
+		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+			do_free = 0;
+		}
+
+		if (do_free) {
+			pagebuf_free(pb);
+		}
+	}
+}
+
+
+/*
+ * Mutual exclusion on buffers. Locking model:
+ *
+ * Buffers associated with inodes for which buffer locking
+ * is not enabled are not protected by semaphores, and are
+ * assumed to be exclusively owned by the caller. There is a
+ * spinlock in the buffer, used by the caller when concurrent
+ * access is possible.
+ */
+
+/*
+ * pagebuf_cond_lock
+ *
+ * pagebuf_cond_lock locks a buffer object, if it is not already locked.
+ * Note that this in no way locks the underlying pages, so it is only
+ * useful for synchronizing concurrent use of page buffer objects, not
+ * for synchronizing independent access to the underlying pages.
+ */
+int
+pagebuf_cond_lock(		/* lock buffer, if not locked	*/
+				/* (returns -EBUSY if locked)	*/
+	page_buf_t		*pb)
+{
+	int			locked;
+
+	ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+	locked = down_trylock(&pb->pb_sema) == 0;
+	if (locked) {
+		PB_SET_OWNER(pb);
+	}
+	PB_TRACE(pb, "cond_lock", (long)locked);
+	return(locked ? 0 : -EBUSY);
+}
+
+/*
+ * pagebuf_lock_value
+ *
+ * Return lock value for a pagebuf
+ */
+int
+pagebuf_lock_value(
+	page_buf_t		*pb)
+{
+	ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+	return(atomic_read(&pb->pb_sema.count));
+}
+
+/*
+ * pagebuf_lock
+ *
+ * pagebuf_lock locks a buffer object. Note that this in no way
+ * locks the underlying pages, so it is only useful for synchronizing
+ * concurrent use of page buffer objects, not for synchronizing independent
+ * access to the underlying pages.
+ */
+int
+pagebuf_lock(
+	page_buf_t		*pb)
+{
+	ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+
+	PB_TRACE(pb, "lock", 0);
+	if (atomic_read(&pb->pb_io_remaining))
+		blk_run_queues();
+	down(&pb->pb_sema);
+	PB_SET_OWNER(pb);
+	PB_TRACE(pb, "locked", 0);
+	return 0;
+}
+
+/*
+ * pagebuf_unlock
+ *
+ * pagebuf_unlock releases the lock on the buffer object created by
+ * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying
+ * pages created by pagebuf_pin).
+ */
+void
+pagebuf_unlock(			/* unlock buffer		*/
+	page_buf_t		*pb)	/* buffer to unlock	*/
+{
+	ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+	PB_CLEAR_OWNER(pb);
+	up(&pb->pb_sema);
+	PB_TRACE(pb, "unlock", 0);
+}
+
+
+/*
+ * Pinning Buffer Storage in Memory
+ */
+
+/*
+ * pagebuf_pin
+ *
+ * pagebuf_pin locks all of the memory represented by a buffer in
+ * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ * the same or different buffers affecting a given page, will
+ * properly count the number of outstanding "pin" requests. The
+ * buffer may be released after the pagebuf_pin and a different
+ * buffer used when calling pagebuf_unpin, if desired.
+ * pagebuf_pin should be used by the file system when it wants to
+ * be assured that no attempt will be made to force the affected
+ * memory to disk. It does not assure that a given logical page
+ * will not be moved to a different physical page.
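+ *
+ * An illustrative use, sketching a transaction-commit style caller
+ * (the "log the contents" step stands in for filesystem-specific work):
+ *
+ *	pagebuf_pin(pb);
+ *	... log the buffer contents; writeback is now held off ...
+ *	pagebuf_unpin(pb);
+ *
+ * A writer waits for the pin count to reach zero via
+ * _pagebuf_wait_unpin() below before issuing the write.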
+ */
+void
+pagebuf_pin(
+	page_buf_t		*pb)
+{
+	atomic_inc(&pb->pb_pin_count);
+	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
+}
+
+/*
+ * pagebuf_unpin
+ *
+ * pagebuf_unpin reverses the locking of memory performed by
+ * pagebuf_pin. Note that both functions affect the logical
+ * pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&pb->pb_pin_count)) {
+		wake_up_all(&pb->pb_waiters);
+	}
+	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
+}
+
+int
+pagebuf_ispin(
+	page_buf_t		*pb)
+{
+	return atomic_read(&pb->pb_pin_count);
+}
+
+/*
+ * pagebuf_wait_unpin
+ *
+ * pagebuf_wait_unpin waits until all of the memory associated
+ * with the buffer is no longer locked in memory. It returns
+ * immediately if none of the affected pages are locked.
+ */
+static inline void
+_pagebuf_wait_unpin(
+	page_buf_t		*pb)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (atomic_read(&pb->pb_pin_count) == 0)
+		return;
+
+	add_wait_queue(&pb->pb_waiters, &wait);
+	for (;;) {
+		current->state = TASK_UNINTERRUPTIBLE;
+		if (atomic_read(&pb->pb_pin_count) == 0)
+			break;
+		if (atomic_read(&pb->pb_io_remaining))
+			blk_run_queues();
+		schedule();
+	}
+	remove_wait_queue(&pb->pb_waiters, &wait);
+	current->state = TASK_RUNNING;
+}
+
+/*
+ * Buffer Utility Routines
+ */
+
+/*
+ * pagebuf_iodone
+ *
+ * pagebuf_iodone marks a buffer for which I/O is in progress as done
+ * with respect to that I/O. The pb_iodone routine, if present, will
+ * be called as a side-effect.
+ */
+void
+pagebuf_iodone_work(
+	void			*v)
+{
+	page_buf_t		*pb = (page_buf_t *)v;
+
+	if (pb->pb_iodone) {
+		(*(pb->pb_iodone)) (pb);
+		return;
+	}
+
+	if (pb->pb_flags & PBF_ASYNC) {
+		if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+void
+pagebuf_iodone(
+	page_buf_t		*pb,
+	int			dataio,
+	int			schedule)
+{
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+	if (pb->pb_error == 0) {
+		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+	}
+
+	PB_TRACE(pb, "iodone", pb->pb_iodone);
+
+	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+		if (schedule) {
+			INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
+			queue_work(dataio ? pagebuf_dataio_workqueue :
+				pagebuf_logio_workqueue, &pb->pb_iodone_work);
+		} else {
+			pagebuf_iodone_work(pb);
+		}
+	} else {
+		up(&pb->pb_iodonesema);
+	}
+}
+
+/*
+ * pagebuf_ioerror
+ *
+ * pagebuf_ioerror sets the error code for a buffer.
+ */
+void
+pagebuf_ioerror(		/* mark/clear buffer error flag */
+	page_buf_t		*pb,	/* buffer to mark		*/
+	unsigned int		error)	/* error to store (0 if none)	*/
+{
+	pb->pb_error = error;
+	PB_TRACE(pb, "ioerror", (unsigned long)error);
+}
+
+/*
+ * pagebuf_iostart
+ *
+ * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ * If necessary, it will arrange for any disk space allocation required,
+ * and it will break up the request if the block mappings require it.
+ * The pb_iodone routine in the buffer supplied will only be called
+ * when all of the subsidiary I/O requests, if any, have been completed.
+ * pagebuf_iostart calls the pagebuf_ioinitiate routine, or
+ * pagebuf_iorequest if the former routine is not defined, to start
+ * the I/O on a given low-level request.
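+ *
+ * A synchronous write can be issued as, for example ("pb" is assumed
+ * to have been set up via pagebuf_get() and pagebuf_iomove()):
+ *
+ *	error = pagebuf_iostart(pb, PBF_WRITE | PBF_SYNC);
+ *
+ * Without PBF_ASYNC this waits internally via pagebuf_iowait() and
+ * returns the final pb_error value.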
+ */ +int +pagebuf_iostart( /* start I/O on a buffer */ + page_buf_t *pb, /* buffer to start */ + page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ + /* PBF_WRITE, PBF_DELWRI, */ + /* PBF_SYNC, PBF_DONT_BLOCK */ +{ + int status = 0; + + PB_TRACE(pb, "iostart", (unsigned long)flags); + + if (flags & PBF_DELWRI) { + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); + pb->pb_flags |= flags & + (PBF_DELWRI | PBF_ASYNC | PBF_SYNC); + pagebuf_delwri_queue(pb, 1); + return status; + } + + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \ + PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES); + pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ + PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES); + + BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL); + + /* For writes allow an alternate strategy routine to precede + * the actual I/O request (which may not be issued at all in + * a shutdown situation, for example). + */ + status = (flags & PBF_WRITE) ? + pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); + + /* Wait for I/O if we are not an async request. + * Note: async I/O request completion will release the buffer, + * and that can already be done by this point. So using the + * buffer pointer from here on, after async I/O, is invalid. + */ + if (!status && !(flags & PBF_ASYNC)) + status = pagebuf_iowait(pb); + + return status; +} + +/* + * Helper routine for pagebuf_iorequest + */ + +STATIC __inline__ int +_pagebuf_iolocked( + page_buf_t *pb) +{ + ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); + if (pb->pb_flags & PBF_READ) + return pb->pb_locked; + return ((pb->pb_flags & _PBF_LOCKABLE) == 0); +} + +STATIC __inline__ void +_pagebuf_iodone( + page_buf_t *pb, + int schedule) +{ + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pb->pb_locked = 0; + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule); + } +} + +STATIC int +bio_end_io_pagebuf( + struct bio *bio, + unsigned int bytes_done, + int error) +{ + page_buf_t *pb = (page_buf_t *)bio->bi_private; + unsigned int i, blocksize = pb->pb_target->pbr_bsize; + unsigned int sectorshift = pb->pb_target->pbr_sshift; + struct bio_vec *bvec = bio->bi_io_vec; + + if (bio->bi_size) + return 1; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + pb->pb_error = EIO; + + for (i = 0; i < bio->bi_vcnt; i++, bvec++) { + struct page *page = bvec->bv_page; + + if (pb->pb_error) { + SetPageError(page); + } else if (blocksize == PAGE_CACHE_SIZE) { + SetPageUptodate(page); + } else if (!PagePrivate(page)) { + unsigned int j, range; + + ASSERT(blocksize < PAGE_CACHE_SIZE); + range = (bvec->bv_offset + bvec->bv_len) >> sectorshift; + for (j = bvec->bv_offset >> sectorshift; j < range; j++) + set_bit(j, &page->private); + if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1)) + SetPageUptodate(page); + } + + if (_pagebuf_iolocked(pb)) { + unlock_page(page); + } + } + + _pagebuf_iodone(pb, 1); + bio_put(bio); + return 0; +} + +void +_pagebuf_ioapply( + page_buf_t *pb) +{ + int i, map_i, total_nr_pages, nr_pages; + struct bio *bio; + int offset = pb->pb_offset; + int size = pb->pb_count_desired; + sector_t sector = pb->pb_bn; + unsigned int blocksize = pb->pb_target->pbr_bsize; + int locking = _pagebuf_iolocked(pb); + + total_nr_pages = pb->pb_page_count; + map_i = 0; + + /* Special code path for reading a sub page size pagebuf in -- + * we populate up the whole page, and hence the other metadata + * in the same page. This optimization is only valid when the + * filesystem block size and the page size are equal. 
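+	 * For example, a single sector read is rounded up here to a
+	 * full page, so other metadata sharing that page is brought
+	 * uptodate as a side effect of the one I/O.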
+ */ + if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && + (pb->pb_flags & PBF_READ) && locking && + (blocksize == PAGE_CACHE_SIZE)) { + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector - (offset >> BBSHIFT); + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); + size = 0; + + atomic_inc(&pb->pb_io_remaining); + + goto submit_io; + } + + /* Lock down the pages which we need to for the request */ + if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { + for (i = 0; size; i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + struct page *page = pb->pb_pages[i]; + + if (nbytes > size) + nbytes = size; + + lock_page(page); + + size -= nbytes; + offset = 0; + } + offset = pb->pb_offset; + size = pb->pb_count_desired; + } + +next_chunk: + atomic_inc(&pb->pb_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector; + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + for (; size && nr_pages; nr_pages--, map_i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + if (bio_add_page(bio, pb->pb_pages[map_i], + nbytes, offset) < nbytes) + break; + + offset = 0; + sector += nbytes >> BBSHIFT; + size -= nbytes; + total_nr_pages--; + } + +submit_io: + if (likely(bio->bi_size)) { + submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio); + if (size) + goto next_chunk; + } else { + bio_put(bio); + pagebuf_ioerror(pb, EIO); + } + + if (pb->pb_flags & PBF_RUN_QUEUES) { + pb->pb_flags &= ~PBF_RUN_QUEUES; + if (atomic_read(&pb->pb_io_remaining) > 1) + blk_run_queues(); + } +} + +/* + * pagebuf_iorequest + * + * pagebuf_iorequest is the core I/O request routine. + * It assumes that the buffer is well-formed and + * mapped and ready for physical I/O, unlike + * pagebuf_iostart() and pagebuf_iophysio(). Those + * routines call the pagebuf_ioinitiate routine to start I/O, + * if it is present, or else call pagebuf_iorequest() + * directly if the pagebuf_ioinitiate routine is not present. + * + * This function will be responsible for ensuring access to the + * pages is restricted whilst I/O is in progress - for locking + * pagebufs the pagebuf lock is the mediator, for non-locking + * pagebufs the pages will be locked. In the locking case we + * need to use the pagebuf lock as multiple meta-data buffers + * will reference the same page. + */ +int +pagebuf_iorequest( /* start real I/O */ + page_buf_t *pb) /* buffer to convey to device */ +{ + PB_TRACE(pb, "iorequest", 0); + + if (pb->pb_flags & PBF_DELWRI) { + pagebuf_delwri_queue(pb, 1); + return 0; + } + + if (pb->pb_flags & PBF_WRITE) { + _pagebuf_wait_unpin(pb); + } + + pagebuf_hold(pb); + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling pagebuf_iodone too early. + */ + atomic_set(&pb->pb_io_remaining, 1); + _pagebuf_ioapply(pb); + _pagebuf_iodone(pb, 0); + + pagebuf_rele(pb); + return 0; +} + +/* + * pagebuf_iowait + * + * pagebuf_iowait waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. In any case, it returns + * the error code, if any, or 0 if there is no error. 
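+ *
+ * The usual split between starting and waiting, essentially the
+ * XFS_bwrite() pattern from xfs_buf.h (sketch only):
+ *
+ *	pagebuf_iorequest(pb);		... queue the I/O ...
+ *	error = pagebuf_iowait(pb);	... sleep until it completes ...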
+ */ +int +pagebuf_iowait( + page_buf_t *pb) +{ + PB_TRACE(pb, "iowait", 0); + if (atomic_read(&pb->pb_io_remaining)) + blk_run_queues(); + down(&pb->pb_iodonesema); + PB_TRACE(pb, "iowaited", (long)pb->pb_error); + return pb->pb_error; +} + +STATIC void * +pagebuf_mapout_locked( + page_buf_t *pb) +{ + void *old_addr = NULL; + + if (pb->pb_flags & PBF_MAPPED) { + if (pb->pb_flags & _PBF_ADDR_ALLOCATED) + old_addr = pb->pb_addr - pb->pb_offset; + pb->pb_addr = NULL; + pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED); + } + + return old_addr; /* Caller must free the address space, + * we are under a spin lock, probably + * not safe to do vfree here + */ +} + +caddr_t +pagebuf_offset( + page_buf_t *pb, + size_t offset) +{ + struct page *page; + + offset += pb->pb_offset; + + page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; + return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); +} + +/* + * pagebuf_iomove + * + * Move data into or out of a buffer. + */ +void +pagebuf_iomove( + page_buf_t *pb, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + caddr_t data, /* data address */ + page_buf_rw_t mode) /* read/write flag */ +{ + size_t bend, cpoff, csize; + struct page *page; + + bend = boff + bsize; + while (boff < bend) { + page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; + cpoff = page_buf_poff(boff + pb->pb_offset); + csize = min_t(size_t, + PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); + + ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); + + switch (mode) { + case PBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + break; + case PBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + break; + case PBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + } + + boff += csize; + data += csize; + } +} + + +/* + * Pagebuf delayed write buffer handling + */ + +STATIC LIST_HEAD(pbd_delwrite_queue); +STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED; + +STATIC void +pagebuf_delwri_queue( + page_buf_t *pb, + int unlock) +{ + PB_TRACE(pb, "delwri_q", (long)unlock); + spin_lock(&pbd_delwrite_lock); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&pb->pb_list)) { + if (unlock) { + atomic_dec(&pb->pb_hold); + } + list_del(&pb->pb_list); + } + + list_add_tail(&pb->pb_list, &pbd_delwrite_queue); + pb->pb_flushtime = jiffies + pb_params.age_buffer.val; + spin_unlock(&pbd_delwrite_lock); + + if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) { + pagebuf_unlock(pb); + } +} + +void +pagebuf_delwri_dequeue( + page_buf_t *pb) +{ + PB_TRACE(pb, "delwri_uq", 0); + spin_lock(&pbd_delwrite_lock); + list_del_init(&pb->pb_list); + pb->pb_flags &= ~PBF_DELWRI; + spin_unlock(&pbd_delwrite_lock); +} + +STATIC void +pagebuf_runall_queues( + struct workqueue_struct *queue) +{ + flush_workqueue(queue); +} + +/* Defines for pagebuf daemon */ +STATIC DECLARE_COMPLETION(pagebuf_daemon_done); +STATIC struct task_struct *pagebuf_daemon_task; +STATIC int pagebuf_daemon_active; +STATIC int force_flush; + +STATIC void +pagebuf_daemon_wakeup(void) +{ + force_flush = 1; + barrier(); + wake_up_process(pagebuf_daemon_task); +} + +STATIC int +pagebuf_daemon( + void *data) +{ + int count; + page_buf_t *pb; + struct list_head *curr, *next, tmp; + + /* Set up the thread */ + daemonize("pagebufd"); + current->flags |= PF_MEMALLOC; + + pagebuf_daemon_task = current; + pagebuf_daemon_active = 1; + barrier(); + + INIT_LIST_HEAD(&tmp); + do { + /* swsusp */ + if (current->flags & PF_FREEZE) + 
refrigerator(PF_IOTHREAD); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pb_params.flush_interval.val); + + spin_lock(&pbd_delwrite_lock); + + count = 0; + list_for_each_safe(curr, next, &pbd_delwrite_queue) { + pb = list_entry(curr, page_buf_t, pb_list); + + PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); + + if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) && + (((pb->pb_flags & _PBF_LOCKABLE) == 0) || + !pagebuf_cond_lock(pb))) { + + if (!force_flush && + time_before(jiffies, pb->pb_flushtime)) { + pagebuf_unlock(pb); + break; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + + list_del(&pb->pb_list); + list_add(&pb->pb_list, &tmp); + + count++; + } + } + + spin_unlock(&pbd_delwrite_lock); + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, page_buf_t, pb_list); + list_del_init(&pb->pb_list); + + pagebuf_iostrategy(pb); + } + + if (as_list_len > 0) + purge_addresses(); + if (count) + blk_run_queues(); + + force_flush = 0; + } while (pagebuf_daemon_active); + + complete_and_exit(&pagebuf_daemon_done, 0); +} + +void +pagebuf_delwri_flush( + pb_target_t *target, + u_long flags, + int *pinptr) +{ + page_buf_t *pb; + struct list_head *curr, *next, tmp; + int pincount = 0; + int flush_cnt = 0; + + pagebuf_runall_queues(pagebuf_dataio_workqueue); + pagebuf_runall_queues(pagebuf_logio_workqueue); + + spin_lock(&pbd_delwrite_lock); + INIT_LIST_HEAD(&tmp); + + list_for_each_safe(curr, next, &pbd_delwrite_queue) { + pb = list_entry(curr, page_buf_t, pb_list); + + /* + * Skip other targets, markers and in progress buffers + */ + + if ((pb->pb_flags == 0) || (pb->pb_target != target) || + !(pb->pb_flags & PBF_DELWRI)) { + continue; + } + + PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); + if (pagebuf_ispin(pb)) { + pincount++; + continue; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + list_move(&pb->pb_list, &tmp); + } + /* ok found all the items that can be worked on + * drop the lock and process the private list */ + spin_unlock(&pbd_delwrite_lock); + + list_for_each_safe(curr, next, &tmp) { + pb = list_entry(curr, page_buf_t, pb_list); + + if (flags & PBDF_WAIT) + pb->pb_flags &= ~PBF_ASYNC; + else + list_del_init(curr); + + pagebuf_lock(pb); + pagebuf_iostrategy(pb); + if (++flush_cnt > 32) { + blk_run_queues(); + flush_cnt = 0; + } + } + + blk_run_queues(); + + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, page_buf_t, pb_list); + + list_del_init(&pb->pb_list); + pagebuf_iowait(pb); + if (!pb->pb_relse) + pagebuf_unlock(pb); + pagebuf_rele(pb); + } + + if (pinptr) + *pinptr = pincount; +} + +STATIC int +pagebuf_daemon_start(void) +{ + int rval; + + pagebuf_logio_workqueue = create_workqueue("xfslogd"); + if (!pagebuf_logio_workqueue) + return -ENOMEM; + + pagebuf_dataio_workqueue = create_workqueue("xfsdatad"); + if (!pagebuf_dataio_workqueue) { + destroy_workqueue(pagebuf_logio_workqueue); + return -ENOMEM; + } + + rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES); + if (rval < 0) { + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); + } + + return rval; +} + +/* + * pagebuf_daemon_stop + * + * Note: do not mark as __exit, it is called from pagebuf_terminate. 
+ */ +STATIC void +pagebuf_daemon_stop(void) +{ + pagebuf_daemon_active = 0; + barrier(); + wait_for_completion(&pagebuf_daemon_done); + + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); +} + + +/* + * Pagebuf sysctl interface + */ + +STATIC int +pb_stats_clear_handler( + ctl_table *ctl, + int write, + struct file *filp, + void *buffer, + size_t *lenp) +{ + int c, ret; + int *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp); + + if (!ret && write && *valp) { + printk("XFS Clearing pbstats\n"); + for (c = 0; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + memset(&per_cpu(pbstats, c), 0, + sizeof(struct pbstats)); + } + pb_params.stats_clear.val = 0; + } + + return ret; +} + +STATIC struct ctl_table_header *pagebuf_table_header; + +STATIC ctl_table pagebuf_table[] = { + {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &pb_params.flush_interval.min, &pb_params.flush_interval.max}, + + {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &pb_params.age_buffer.min, &pb_params.age_buffer.max}, + + {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val, + sizeof(int), 0644, NULL, &pb_stats_clear_handler, + &sysctl_intvec, NULL, + &pb_params.stats_clear.min, &pb_params.stats_clear.max}, + +#ifdef PAGEBUF_TRACE + {PB_DEBUG, "debug", &pb_params.debug.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &pb_params.debug.min, &pb_params.debug.max}, +#endif + {0} +}; + +STATIC ctl_table pagebuf_dir_table[] = { + {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table}, + {0} +}; + +STATIC ctl_table pagebuf_root_table[] = { + {CTL_VM, "vm", NULL, 0, 0555, pagebuf_dir_table}, + {0} +}; + +#ifdef CONFIG_PROC_FS +STATIC int +pagebuf_readstats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int c, i, len, val; + + len = 0; + len += sprintf(buffer + len, "pagebuf"); + for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) { + val = 0; + for (c = 0 ; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + val += *(((u_int32_t*)&per_cpu(pbstats, c) + i)); + } + len += sprintf(buffer + len, " %u", val); + } + buffer[len++] = '\n'; + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} +#endif /* CONFIG_PROC_FS */ + +/* + * Initialization and Termination + */ + +int __init +pagebuf_init(void) +{ + int i; + + pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1); + +#ifdef CONFIG_PROC_FS + if (proc_mkdir("fs/pagebuf", 0)) + create_proc_read_entry( + "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL); +#endif + + pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (pagebuf_cache == NULL) { + printk("pagebuf: couldn't init pagebuf cache\n"); + pagebuf_terminate(); + return -ENOMEM; + } + + for (i = 0; i < NHASH; i++) { + spin_lock_init(&pbhash[i].pb_hash_lock); + INIT_LIST_HEAD(&pbhash[i].pb_hash); + } + +#ifdef PAGEBUF_TRACE + pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); +#endif + + pagebuf_daemon_start(); + return 0; +} + + +/* + * pagebuf_terminate. + * + * Note: do not mark as __exit, this is also called from the __init code. 
+ */ +void +pagebuf_terminate(void) +{ + pagebuf_daemon_stop(); + +#ifdef PAGEBUF_TRACE + ktrace_free(pagebuf_trace_buf); +#endif + + kmem_cache_destroy(pagebuf_cache); + + unregister_sysctl_table(pagebuf_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("fs/pagebuf/stat", NULL); + remove_proc_entry("fs/pagebuf", NULL); +#endif +} diff -Nru a/fs/xfs/linux/xfs_buf.h b/fs/xfs/linux/xfs_buf.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/linux/xfs_buf.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI + */ + +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +/* daddr must be signed since -1 is used for bmaps that are not yet allocated */ +typedef loff_t page_buf_daddr_t; + +#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL)) + +#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) +#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum page_buf_rw_e { + PBRW_READ = 1, /* transfer into target memory */ + PBRW_WRITE = 2, /* transfer from target memory */ + PBRW_ZERO = 3 /* Zero target memory */ +} page_buf_rw_t; + + +typedef enum page_buf_flags_e { /* pb_flags values */ + PBF_READ = (1 << 0), /* buffer intended for reading from device */ + PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ + PBF_PARTIAL = (1 << 3), /* buffer partially read */ + PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + PBF_NONE = (1 << 5), /* buffer not read at all */ + PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + PBF_SYNC = (1 << 8), /* force updates to disk */ + PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages */ + PBF_STALE = (1 << 10), /* buffer has been staled, do not find it */ + PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory */ + PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad */ + + /* flags 
used only as arguments to access routines */
+	PBF_LOCK = (1 << 13),		/* lock requested		*/
+	PBF_TRYLOCK = (1 << 14),	/* lock requested, but do not wait */
+	PBF_DONT_BLOCK = (1 << 15),	/* do not block in current thread */
+
+	/* flags used only internally */
+	_PBF_LOCKABLE = (1 << 16),	/* page_buf_t may be locked	*/
+	_PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped	*/
+	_PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated */
+	_PBF_MEM_ALLOCATED = (1 << 20),	/* underlying pages are allocated */
+	_PBF_MEM_SLAB = (1 << 21),	/* underlying pages are slab allocated */
+
+	PBF_FORCEIO = (1 << 22),	/* ignore any cache state	*/
+	PBF_FLUSH = (1 << 23),		/* flush disk write cache	*/
+	PBF_READ_AHEAD = (1 << 24),	/* asynchronous read-ahead	*/
+	PBF_RUN_QUEUES = (1 << 25),	/* run block device task queue	*/
+
+} page_buf_flags_t;
+
+#define PBF_UPDATE	(PBF_READ | PBF_WRITE)
+#define PBF_NOT_DONE(pb)	(((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
+#define PBF_DONE(pb)	(((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
+
+typedef struct pb_target {
+	dev_t			pbr_dev;
+	struct block_device	*pbr_bdev;
+	struct address_space	*pbr_mapping;
+	unsigned int		pbr_bsize;
+	unsigned int		pbr_sshift;
+	size_t			pbr_smask;
+} pb_target_t;
+
+/*
+ * page_buf_t: Buffer structure for page cache-based buffers
+ *
+ * This buffer structure is used by the page cache buffer management routines
+ * to refer to an assembly of pages forming a logical buffer. The actual
+ * I/O is performed with buffer_head or bio structures, as required by the
+ * underlying drivers, which do not understand this structure. The buffer
+ * structure is used on a temporary basis only, and discarded when released.
+ *
+ * The real data storage is recorded in the page cache. Metadata is
+ * hashed to the inode for the block device on which the file system resides.
+ * File data is hashed to the inode for the file. Pages which are only
+ * partially filled with data have bits set in their block_map entry
+ * to indicate which disk blocks in the page are valid.
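+ *
+ * For example (assuming 512 byte sectors and 4K pages, i.e.
+ * pbr_sshift == 9), page->private carries one bit per sector; a page
+ * with only its first 1K valid has bits 0 and 1 set. Both
+ * _pagebuf_lookup_pages() and bio_end_io_pagebuf() use this encoding.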
+ */ + +struct page_buf_s; +typedef void (*page_buf_iodone_t)(struct page_buf_s *); + /* call-back function on I/O completion */ +typedef void (*page_buf_relse_t)(struct page_buf_s *); + /* call-back function on I/O completion */ +typedef int (*page_buf_bdstrat_t)(struct page_buf_s *); + +#define PB_PAGES 4 + +typedef struct page_buf_s { + struct semaphore pb_sema; /* semaphore for lockables */ + unsigned long pb_flushtime; /* time to flush pagebuf */ + atomic_t pb_pin_count; /* pin count */ + wait_queue_head_t pb_waiters; /* unpin waiters */ + struct list_head pb_list; + page_buf_flags_t pb_flags; /* status flags */ + struct list_head pb_hash_list; + struct pb_target *pb_target; /* logical object */ + atomic_t pb_hold; /* reference count */ + page_buf_daddr_t pb_bn; /* block number for I/O */ + loff_t pb_file_offset; /* offset in file */ + size_t pb_buffer_length; /* size of buffer in bytes */ + size_t pb_count_desired; /* desired transfer size */ + void *pb_addr; /* virtual address of buffer */ + struct work_struct pb_iodone_work; + atomic_t pb_io_remaining;/* #outstanding I/O requests */ + page_buf_iodone_t pb_iodone; /* I/O completion function */ + page_buf_relse_t pb_relse; /* releasing function */ + page_buf_bdstrat_t pb_strat; /* pre-write function */ + struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + void *pb_fspriv; + void *pb_fspriv2; + void *pb_fspriv3; + unsigned short pb_error; /* error code on I/O */ + unsigned short pb_page_count; /* size of page array */ + unsigned short pb_offset; /* page offset in first page */ + unsigned char pb_locked; /* page array is locked */ + unsigned char pb_hash_index; /* hash table index */ + struct page **pb_pages; /* array of page pointers */ + struct page *pb_page_array[PB_PAGES]; /* inline pages */ +#ifdef PAGEBUF_LOCK_TRACKING + int pb_last_holder; +#endif +} page_buf_t; + + +/* Finding and Reading Buffers */ + +extern page_buf_t *pagebuf_find( /* find buffer for block if */ + /* the block is in memory */ + struct pb_target *, /* inode for block */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK */ + +extern page_buf_t *pagebuf_get( /* allocate a buffer */ + struct pb_target *, /* inode for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ + /* PBF_ASYNC */ + +extern page_buf_t *pagebuf_lookup( + struct pb_target *, + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_READ, PBF_WRITE, */ + /* PBF_FORCEIO, _PBF_LOCKABLE */ + +extern page_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ + /* no memory or disk address */ + size_t len, + struct pb_target *); /* mount point "fake" inode */ + +extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ + /* without disk address */ + size_t len, + struct pb_target *); /* mount point "fake" inode */ + +extern int pagebuf_associate_memory( + page_buf_t *, + void *, + size_t); + +extern void pagebuf_hold( /* increment reference count */ + page_buf_t *); /* buffer to hold */ + +extern void pagebuf_readahead( /* read ahead into cache */ + struct pb_target *, /* target for buffer (or NULL) */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* additional read flags */ + +/* Releasing Buffers */ + +extern void pagebuf_free( /* deallocate a buffer */ + page_buf_t *); /* buffer to deallocate */ + +extern void pagebuf_rele( /* release hold on 
a buffer */ + page_buf_t *); /* buffer to release */ + +/* Locking and Unlocking Buffers */ + +extern int pagebuf_cond_lock( /* lock buffer, if not locked */ + /* (returns -EBUSY if locked) */ + page_buf_t *); /* buffer to lock */ + +extern int pagebuf_lock_value( /* return count on lock */ + page_buf_t *); /* buffer to check */ + +extern int pagebuf_lock( /* lock buffer */ + page_buf_t *); /* buffer to lock */ + +extern void pagebuf_unlock( /* unlock buffer */ + page_buf_t *); /* buffer to unlock */ + +/* Buffer Read and Write Routines */ + +extern void pagebuf_iodone( /* mark buffer I/O complete */ + page_buf_t *, /* buffer to mark */ + int, /* use data/log helper thread. */ + int); /* run completion locally, or in + * a helper thread. */ + +extern void pagebuf_ioerror( /* mark buffer in error (or not) */ + page_buf_t *, /* buffer to mark */ + unsigned int); /* error to store (0 if none) */ + +extern int pagebuf_iostart( /* start I/O on a buffer */ + page_buf_t *, /* buffer to start */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ + /* PBF_READ, PBF_WRITE, */ + /* PBF_DELWRI, PBF_SYNC */ + +extern int pagebuf_iorequest( /* start real I/O */ + page_buf_t *); /* buffer to convey to device */ + +extern int pagebuf_iowait( /* wait for buffer I/O done */ + page_buf_t *); /* buffer to wait on */ + +extern void pagebuf_iomove( /* move data in/out of pagebuf */ + page_buf_t *, /* buffer to manipulate */ + size_t, /* starting buffer offset */ + size_t, /* length in buffer */ + caddr_t, /* data pointer */ + page_buf_rw_t); /* direction */ + +static inline int pagebuf_iostrategy(page_buf_t *pb) +{ + return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); +} + +static inline int pagebuf_geterror(page_buf_t *pb) +{ + return pb ? pb->pb_error : ENOMEM; +} + +/* Buffer Utility Routines */ + +extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ + page_buf_t *, /* buffer to offset into */ + size_t); /* offset */ + +/* Pinning Buffer Storage in Memory */ + +extern void pagebuf_pin( /* pin buffer in memory */ + page_buf_t *); /* buffer to pin */ + +extern void pagebuf_unpin( /* unpin buffered data */ + page_buf_t *); /* buffer to unpin */ + +extern int pagebuf_ispin( /* check if buffer is pinned */ + page_buf_t *); /* buffer to check */ + +/* Delayed Write Buffer Routines */ + +#define PBDF_WAIT 0x01 +extern void pagebuf_delwri_flush( + pb_target_t *, + unsigned long, + int *); + +extern void pagebuf_delwri_dequeue( + page_buf_t *); + +/* Buffer Daemon Setup Routines */ + +extern int pagebuf_init(void); +extern void pagebuf_terminate(void); + + +#ifdef PAGEBUF_TRACE +extern ktrace_t *pagebuf_trace_buf; +extern void pagebuf_trace( + page_buf_t *, /* buffer being traced */ + char *, /* description of operation */ + void *, /* arbitrary diagnostic value */ + void *); /* return address */ +#else +# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) +#endif + +#define pagebuf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) + + + + + +/* These are just for xfs_syncsub... 
it sets an internal variable + * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t + */ +#define XFS_B_ASYNC PBF_ASYNC +#define XFS_B_DELWRI PBF_DELWRI +#define XFS_B_READ PBF_READ +#define XFS_B_WRITE PBF_WRITE +#define XFS_B_STALE PBF_STALE + +#define XFS_BUF_TRYLOCK PBF_TRYLOCK +#define XFS_INCORE_TRYLOCK PBF_TRYLOCK +#define XFS_BUF_LOCK PBF_LOCK +#define XFS_BUF_MAPPED PBF_MAPPED + +#define BUF_BUSY PBF_DONT_BLOCK + +#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) +#define XFS_BUF_ZEROFLAGS(x) \ + ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI)) + +#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(x) do { \ + XFS_BUF_STALE(x); \ + xfs_buf_undelay(x); \ + XFS_BUF_DONE(x); \ + } while (0) + +#define XFS_BUF_MANAGE PBF_FS_MANAGED +#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) + +static inline void xfs_buf_undelay(page_buf_t *pb) +{ + if (pb->pb_flags & PBF_DELWRI) { + if (pb->pb_list.next != &pb->pb_list) { + pagebuf_delwri_dequeue(pb); + pagebuf_rele(pb); + } else { + pb->pb_flags &= ~PBF_DELWRI; + } + } +} + +#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(x) xfs_buf_undelay(x) +#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) + +#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) +#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) +#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) + +#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)) +#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE) +#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x))) + +#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO) +#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO) +#define XFS_BUF_ISBUSY(x) (1) + +#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) +#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) +#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) + +#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) +#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) +#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) + +#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") +#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") +#define XFS_BUF_ISSHUT(x) (0) + +#define XFS_BUF_HOLD(x) pagebuf_hold(x) +#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) +#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) +#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) + +#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) +#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) +#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(x) (0) +#define XFS_BUF_UNUNINITIAL(x) (0) + +#define XFS_BUF_BP_ISMAPPED(bp) 1 + +typedef struct page_buf_s xfs_buf_t; +#define xfs_buf page_buf_s + +typedef struct pb_target xfs_buftarg_t; +#define xfs_buftarg pb_target + +#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD) +#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD) + +#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone +#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ + (buf)->pb_iodone = (func) +#define XFS_BUF_CLR_IODONE_FUNC(buf) \ + (buf)->pb_iodone = NULL +#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ + (buf)->pb_strat = (func) +#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ + (buf)->pb_strat = NULL + +#define XFS_BUF_FSPRIVATE(buf, type) \ 
+ ((type)(buf)->pb_fspriv) +#define XFS_BUF_SET_FSPRIVATE(buf, value) \ + (buf)->pb_fspriv = (void *)(value) +#define XFS_BUF_FSPRIVATE2(buf, type) \ + ((type)(buf)->pb_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ + (buf)->pb_fspriv2 = (void *)(value) +#define XFS_BUF_FSPRIVATE3(buf, type) \ + ((type)(buf)->pb_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ + (buf)->pb_fspriv3 = (void *)(value) +#define XFS_BUF_SET_START(buf) + +#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ + (buf)->pb_relse = (value) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) + +extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset) +{ + if (bp->pb_flags & PBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + return (xfs_caddr_t) pagebuf_offset(bp, offset); +} + +#define XFS_BUF_SET_PTR(bp, val, count) \ + pagebuf_associate_memory(bp, val, count) +#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) +#define XFS_BUF_SET_ADDR(bp, blk) \ + ((bp)->pb_bn = (page_buf_daddr_t)(blk)) +#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) \ + ((bp)->pb_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) \ + ((bp)->pb_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) \ + ((bp)->pb_buffer_length = (cnt)) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); + +/* setup the buffer target from a buftarg structure */ +#define XFS_BUF_SET_TARGET(bp, target) \ + (bp)->pb_target = (target) +#define XFS_BUF_TARGET(bp) ((bp)->pb_target) +#define XFS_BUFTARG_NAME(target) \ + pagebuf_target_name(target) + +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define xfs_buf_read(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE) +#define xfs_buf_get(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE) + +#define xfs_buf_read_flags(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_READ | PBF_MAPPABLE | flags) +#define xfs_buf_get_flags(target, blkno, len, flags) \ + pagebuf_get((target), (blkno), (len), \ + PBF_MAPPABLE | flags) + +static inline int xfs_bawrite(void *mp, page_buf_t *bp) +{ + bp->pb_fspriv3 = mp; + bp->pb_strat = xfs_bdstrat_cb; + xfs_buf_undelay(bp); + return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES); +} + +static inline void xfs_buf_relse(page_buf_t *bp) +{ + if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse) + pagebuf_unlock(bp); + pagebuf_rele(bp); +} + +#define xfs_bpin(bp) pagebuf_pin(bp) +#define xfs_bunpin(bp) pagebuf_unpin(bp) + +#define xfs_buftrace(id, bp) \ + pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) + +#define xfs_biodone(pb) \ + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0) + +#define xfs_incore(buftarg,blkno,len,lockit) \ + pagebuf_find(buftarg, blkno ,len, lockit) + + +#define xfs_biomove(pb, off, len, data, rw) \ + pagebuf_iomove((pb), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? 
PBRW_WRITE : PBRW_READ) + +#define xfs_biozero(pb, off, len) \ + pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) + + +static inline int XFS_bwrite(page_buf_t *pb) +{ + int iowait = (pb->pb_flags & PBF_ASYNC) == 0; + int error = 0; + + pb->pb_flags |= PBF_SYNC; + if (!iowait) + pb->pb_flags |= PBF_RUN_QUEUES; + + xfs_buf_undelay(pb); + pagebuf_iostrategy(pb); + if (iowait) { + error = pagebuf_iowait(pb); + xfs_buf_relse(pb); + } + return error; +} + +#define XFS_bdwrite(pb) \ + pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) + +static inline int xfs_bdwrite(void *mp, page_buf_t *bp) +{ + bp->pb_strat = xfs_bdstrat_cb; + bp->pb_fspriv3 = mp; + + return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); +} + +#define XFS_bdstrat(bp) pagebuf_iorequest(bp) + +#define xfs_iowait(pb) pagebuf_iowait(pb) + + +/* + * Go through all incore buffers, and release buffers + * if they belong to the given device. This is used in + * filesystem error handling to preserve the consistency + * of its metadata. + */ + +#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg) + +#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg) + +#define xfs_incore_relse(buftarg,delwri_only,wait) \ + xfs_relse_buftarg(buftarg) + +#define xfs_baread(target, rablkno, ralen) \ + pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) + +#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) +#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) +#define xfs_buf_free(bp) pagebuf_free(bp) + +#endif /* __XFS_BUF_H__ */ + diff -Nru a/fs/xfs/linux/xfs_globals.c b/fs/xfs/linux/xfs_globals.c --- a/fs/xfs/linux/xfs_globals.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/linux/xfs_globals.c Thu Jan 29 23:15:58 2004 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -36,57 +36,8 @@ */ #include "xfs.h" - -#include "xfs_fs.h" -#include "xfs_buf.h" -#include "xfs_inum.h" -#include "xfs_log.h" -#include "xfs_clnt.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir.h" -#include "xfs_dir2.h" -#include "xfs_imap.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_alloc_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_attr_sf.h" -#include "xfs_dir_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_bit.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_da_btree.h" -#include "xfs_dir_leaf.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_attr_leaf.h" -#include "xfs_inode_item.h" -#include "xfs_buf_item.h" -#include "xfs_extfree_item.h" -#include "xfs_log_priv.h" -#include "xfs_trans_priv.h" -#include "xfs_trans_space.h" -#include "xfs_utils.h" +#include "xfs_cred.h" +#include "xfs_sysctl.h" /* * System memory size - used to scale certain data structures in XFS. 
@@ -117,34 +68,3 @@ */ cred_t sys_cred_val, *sys_cred = &sys_cred_val; -/* - * Export symbols used for XFS debugging - */ -EXPORT_SYMBOL(xfs_next_bit); -EXPORT_SYMBOL(xfs_contig_bits); -EXPORT_SYMBOL(xfs_bmbt_get_all); -#if ARCH_CONVERT != ARCH_NOCONVERT -EXPORT_SYMBOL(xfs_bmbt_disk_get_all); -#endif - -/* - * Export symbols used for XFS tracing - */ -#ifdef XFS_ALLOC_TRACE -EXPORT_SYMBOL(xfs_alloc_trace_buf); -#endif -#ifdef XFS_BMAP_TRACE -EXPORT_SYMBOL(xfs_bmap_trace_buf); -#endif -#ifdef XFS_BMBT_TRACE -EXPORT_SYMBOL(xfs_bmbt_trace_buf); -#endif -#ifdef XFS_ATTR_TRACE -EXPORT_SYMBOL(xfs_attr_trace_buf); -#endif -#ifdef XFS_DIR2_TRACE -EXPORT_SYMBOL(xfs_dir2_trace_buf); -#endif -#ifdef XFS_DIR_TRACE -EXPORT_SYMBOL(xfs_dir_trace_buf); -#endif diff -Nru a/fs/xfs/linux/xfs_iomap.c b/fs/xfs/linux/xfs_iomap.c --- a/fs/xfs/linux/xfs_iomap.c Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,795 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
- * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ - -#include "xfs.h" - -#include "xfs_fs.h" -#include "xfs_inum.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_alloc_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_attr_sf.h" -#include "xfs_dir_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_bit.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_trans_space.h" -#include "xfs_utils.h" -#include "xfs_iomap.h" - -#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ - << mp->m_writeio_log) -#define XFS_STRAT_WRITE_IMAPS 2 -#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP - -STATIC int -xfs_imap_to_bmap( - xfs_iocore_t *io, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp; - xfs_fsize_t nisize; - int pbm; - xfs_fsblock_t start_block; - - mp = io->io_mount; - nisize = XFS_SIZE(mp, io); - if (io->io_new_size > nisize) - nisize = io->io_new_size; - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags = IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags = IOMAP_DELAY; - } else { - iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) { - iomapp->iomap_flags |= IOMAP_EOF; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} - -int -xfs_iomap( - xfs_iocore_t *io, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) -{ - xfs_mount_t *mp = io->io_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - switch (flags & - (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE | - BMAPI_UNWRITTEN | BMAPI_DEVICE)) { - case BMAPI_READ: - lockmode = XFS_LCK_MAP_SHARED(mp, io); - bmapi_flags = XFS_BMAPI_ENTIRE; - if (flags & BMAPI_IGNSTATE) - bmapi_flags |= XFS_BMAPI_IGSTATE; - break; - case BMAPI_WRITE: - lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; - bmapi_flags = 0; - XFS_ILOCK(mp, io, lockmode); - break; - case BMAPI_ALLOCATE: - lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; - bmapi_flags = XFS_BMAPI_ENTIRE; - /* Attempt non-blocking lock */ - if (flags & BMAPI_TRYLOCK) { - if (!XFS_ILOCK_NOWAIT(mp, io, lockmode)) - return XFS_ERROR(EAGAIN); - } else { - XFS_ILOCK(mp, io, lockmode); - } - break; - case BMAPI_UNWRITTEN: - goto phase2; - case BMAPI_DEVICE: - lockmode = XFS_LCK_MAP_SHARED(mp, io); - iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; - error = 0; - *niomaps = 1; - goto out; - default: - BUG(); - } - - ASSERT(offset <= mp->m_maxioffset); - if ((xfs_fsize_t)offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = XFS_BMAPI(mp, NULL, io, offset_fsb, - (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL); - - if (error) - goto out; - -phase2: - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) { - case BMAPI_WRITE: - /* If we found an extent, return it */ - if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) - break; - - if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { - error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset, - count, flags, &imap, &nimaps, nimaps); - } else { - error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, - flags, &imap, &nimaps); - } - iomap_flags = IOMAP_NEW; - break; - case BMAPI_ALLOCATE: - /* If we found an extent, return it */ - XFS_IUNLOCK(mp, io, lockmode); - lockmode = 0; - - if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) - break; - - error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps); - break; - case BMAPI_UNWRITTEN: - lockmode = 0; - error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count); - nimaps = 0; - break; - } - - if (nimaps) { - *niomaps = xfs_imap_to_bmap(io, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } - -out: - if (lockmode) - XFS_IUNLOCK(mp, io, lockmode); - return XFS_ERROR(error); -} - -STATIC int -xfs_flush_space( - xfs_inode_t *ip, - int *fsynced, - int *ioflags) -{ - switch (*fsynced) { - case 0: - if (ip->i_delayed_blks) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inode(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 1; - } else { - *ioflags |= BMAPI_SYNC; - *fsynced = 2; - } - return 0; - case 1: - *fsynced = 2; - *ioflags |= BMAPI_SYNC; - return 0; - case 2: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_device(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 3; - return 0; - } - return 1; -} - -int -xfs_iomap_write_direct( - xfs_inode_t *ip, - loff_t offset, - size_t count, - int flags, - xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t count_fsb; - xfs_fsize_t isize; - xfs_fsblock_t firstfsb; - int nimaps, maps; - int error; - int bmapi_flag; - int rt; - xfs_trans_t *tp; - xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp; - xfs_bmap_free_t free_list; - int aeof; - xfs_filblks_t datablocks; - int committed; - int numrtextents; - uint resblks; - - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. 
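The hole/delay classification that xfs_imap_to_bmap performs above comes down to two sentinel start-block values; everything else in the translation is arithmetic on the mount's block-size shifts. A minimal userspace sketch of that test, with the sentinel constants written out literally rather than taken from the XFS bmap headers, and with illustrative enum names:

	#include <stdio.h>

	#define HOLESTARTBLOCK	(-2LL)	/* no blocks allocated or reserved */
	#define DELAYSTARTBLOCK	(-1LL)	/* space reserved, not yet allocated */

	enum map_kind { MAP_HOLE, MAP_DELAY, MAP_REAL };

	static enum map_kind classify(long long start_block)
	{
		if (start_block == HOLESTARTBLOCK)
			return MAP_HOLE;
		if (start_block == DELAYSTARTBLOCK)
			return MAP_DELAY;
		return MAP_REAL;	/* a real disk block number */
	}

	int main(void)
	{
		printf("%d %d %d\n", classify(-2LL), classify(-1LL), classify(8192LL));
		return 0;
	}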
- */ - - error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED); - if (error) - return XFS_ERROR(error); - - maps = min(XFS_WRITE_IMAPS, *nmaps); - nimaps = maps; - - isize = ip->i_d.di_size; - aeof = (offset + count) > isize; - - if (io->io_new_size > isize) - isize = io->io_new_size; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - count_fsb = last_fsb - offset_fsb; - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { - xfs_fileoff_t map_last_fsb; - - map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; - - if (map_last_fsb < last_fsb) { - last_fsb = map_last_fsb; - count_fsb = last_fsb - offset_fsb; - } - ASSERT(count_fsb > 0); - } - - /* - * determine whether to reserve space on - * the data or the realtime partition. - */ - if ((rt = XFS_IS_REALTIME_INODE(ip))) { - int sbrtextsize, iprtextsize; - - sbrtextsize = mp->m_sb.sb_rextsize; - iprtextsize = - ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize; - numrtextents = (count_fsb + iprtextsize - 1); - do_div(numrtextents, sbrtextsize); - datablocks = 0; - } else { - datablocks = count_fsb; - numrtextents = 0; - } - - /* - * allocate and set up the transaction - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - - resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); - - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), numrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - - /* - * check for running out of space - */ - if (error) - /* - * Free the transaction structure. - */ - xfs_trans_cancel(tp, 0); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - if (error) - goto error_out; /* don't return above on a reserve error, - we need the ilock held to return */ - - if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) { - error = (EDQUOT); - goto error1; - } - nimaps = 1; - - bmapi_flag = XFS_BMAPI_WRITE; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) - bmapi_flag |= XFS_BMAPI_PREALLOC; - - /* - * issue the bmapi() call to allocate the blocks - */ - XFS_BMAP_INIT(&free_list, &firstfsb); - imapp = &imap[0]; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list); - if (error) { - goto error0; - } - - /* - * complete the transaction - */ - - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); - if (error) { - goto error0; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - goto error_out; - } - - /* copy any maps to caller's array and return any error. */
- if (nimaps == 0) { - error = (ENOSPC); - goto error_out; - } - - *ret_imap = imap[0]; - *nmaps = 1; - return 0; - - error0: /* Cancel bmap, unlock inode, and cancel trans */ - xfs_bmap_cancel(&free_list); - - error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - *nmaps = 0; /* nothing set up here */ - -error_out: - return XFS_ERROR(error); -} - -int -xfs_iomap_write_delay( - xfs_inode_t *ip, - loff_t offset, - size_t count, - int ioflag, - xfs_bmbt_irec_t *ret_imap, - int *nmaps) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_fsize_t isize; - xfs_fsblock_t firstblock; - int nimaps; - int error; - xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int aeof; - int fsynced = 0; - - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); - - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - - error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); - if (error) - return XFS_ERROR(error); - -retry: - isize = ip->i_d.di_size; - if (io->io_new_size > isize) { - isize = io->io_new_size; - } - - aeof = 0; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - /* - * If the caller is doing a write at the end of the file, - * then extend the allocation (and the buffer used for the write) - * out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * We don't bother with this for sync writes, because we need - * to minimize the amount we write for good performance. - */ - if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) { - xfs_off_t aligned_offset; - unsigned int iosize; - xfs_fileoff_t ioalign; - - iosize = mp->m_writeio_blocks; - aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); - ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + iosize; - aeof = 1; - } - - nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - - /* - * round up the allocation request to an m_dalign boundary if the file - * size is greater than 512K and we are allocating past the allocation eof - */ - if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - new_last_fsb = roundup_64(last_fsb, mp->m_dalign); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { - return error; - } - if (eof) { - last_fsb = new_last_fsb; - } - } - - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); - /* - * This can be EDQUOT, if nimaps == 0 - */ - if (error && (error != ENOSPC)) { - return XFS_ERROR(error); - } - /* - * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space. - */ - - if (nimaps == 0) { - if (xfs_flush_space(ip, &fsynced, &ioflag)) - return XFS_ERROR(ENOSPC); - - error = 0; - goto retry; - } - - *ret_imap = imap[0]; - *nmaps = 1; - return 0; -} -
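When the delayed-allocate path above gets nothing back from xfs_bmapi, it retries through xfs_flush_space, which escalates from flushing the one inode, to forcing synchronous allocation, to flushing the whole device, before the caller finally reports ENOSPC. A standalone sketch of the shape of that ladder; the stage names and helpers are illustrative, not kernel API:

	#include <stdio.h>

	enum { TRY_INODE, TRY_SYNC, TRY_DEVICE, GIVE_UP };

	/* returns 0 while there is still something heavier to try */
	static int escalate(int *stage)
	{
		switch (*stage) {
		case TRY_INODE:
			puts("flush this inode's delayed allocations");
			*stage = TRY_SYNC;
			return 0;
		case TRY_SYNC:
			puts("retry with synchronous allocation");
			*stage = TRY_DEVICE;
			return 0;
		case TRY_DEVICE:
			puts("flush the whole device, last retry");
			*stage = GIVE_UP;
			return 0;
		}
		return 1;	/* nothing left: report ENOSPC */
	}

	int main(void)
	{
		int stage = TRY_INODE;

		while (!escalate(&stage))
			;	/* each pass would retry the allocation */
		return 0;
	}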
-/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating caller's request. - * - * Called without a lock on the inode. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - xfs_bmbt_irec_t *map, - int *retmap) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - xfs_filblks_t count_fsb; - xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS]; - xfs_trans_t *tp; - int i, nimaps, committed; - int error = 0; - int nres; - - *retmap = 0; - - /* - * Make sure that the dquots are there. - */ - - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) - return XFS_ERROR(error); - - offset_fsb = map->br_startoff; - count_fsb = map->br_blockcount; - map_start_fsb = offset_fsb; - - XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - - nimaps = 0; - while (nimaps == 0) { - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, nres, - XFS_WRITE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - - if (error == ENOSPC) { - error = xfs_trans_reserve(tp, 0, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - } - if (error) { - xfs_trans_cancel(tp, 0); - return XFS_ERROR(error); - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - XFS_BMAP_INIT(&free_list, &first_block); - - nimaps = XFS_STRAT_WRITE_IMAPS; - /* - * Ensure we don't go beyond eof - it is possible - * the extents changed since we did the read call, as - * we dropped the ilock in the interim. - */ - - end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); - xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = EAGAIN; - goto trans_cancel; - } - } - - /* Go get the actual blocks */ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); - - if (error) - goto trans_cancel; - - error = xfs_bmap_finish(&tp, &free_list, - first_block, &committed); - - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES, NULL); - - if (error) - goto error0; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the caller's request - */ - - for (i = 0; i < nimaps; i++) { - if ((map->br_startoff >= imap[i].br_startoff) && - (map->br_startoff < (imap[i].br_startoff + - imap[i].br_blockcount))) { - *map = imap[i]; - *retmap = 1; - XFS_STATS_INC(xs_xstrat_quick); - return 0; - } - count_fsb -= imap[i].br_blockcount; - } - - /* So far we have not mapped the requested part of the - * file, just surrounding data; try again.
- */ - nimaps--; - offset_fsb = imap[nimaps].br_startoff + - imap[nimaps].br_blockcount; - map_start_fsb = offset_fsb; - } - -trans_cancel: - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); -error0: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return XFS_ERROR(error); -} - -int -xfs_iomap_write_unwritten( - xfs_inode_t *ip, - loff_t offset, - size_t count) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_fileoff_t offset_fsb; - xfs_filblks_t count_fsb; - xfs_filblks_t numblks_fsb; - xfs_bmbt_irec_t imap; - int committed; - int error; - int nres; - int nimaps; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = XFS_B_TO_FSB(mp, count); - - do { - nres = XFS_DIOSTRAT_SPACE_RES(mp, 0); - - /* - * set up a transaction to convert the range of extents - * from unwritten to real. Do allocations in a loop until - * we have covered the range passed in. - */ - - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - error = xfs_trans_reserve(tp, nres, - XFS_WRITE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - /* - * Modify the unwritten extent state of the buffer. - */ - XFS_BMAP_INIT(&free_list, &firstfsb); - nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE, &firstfsb, - 1, &imap, &nimaps, &free_list); - if (error) - goto error_on_bmapi_transaction; - - error = xfs_bmap_finish(&(tp), &(free_list), - firstfsb, &committed); - if (error) - goto error_on_bmapi_transaction; - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto error0; - - if ((numblks_fsb = imap.br_blockcount) == 0) { - /* - * The numblks_fsb value should always get - * smaller, otherwise the loop is stuck. - */ - ASSERT(imap.br_blockcount); - break; - } - offset_fsb += numblks_fsb; - count_fsb -= numblks_fsb; - } while (count_fsb > 0); - - return 0; - -error_on_bmapi_transaction: - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - return XFS_ERROR(error); -} diff -Nru a/fs/xfs/linux/xfs_linux.h b/fs/xfs/linux/xfs_linux.h --- a/fs/xfs/linux/xfs_linux.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/linux/xfs_linux.h Thu Jan 29 23:15:58 2004 @@ -32,6 +32,44 @@ #ifndef __XFS_LINUX__ #define __XFS_LINUX__ +#include +#include + +/* + * Some types are conditional depending on the target system. + * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. + * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well + * as requiring XFS_BIG_BLKNOS to be set. 
+ */ +#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# define XFS_BIG_BLKNOS 1 +# if BITS_PER_LONG == 64 +# define XFS_BIG_INUMS 1 +# else +# define XFS_BIG_INUMS 0 +# endif +#else +# define XFS_BIG_BLKNOS 0 +# define XFS_BIG_INUMS 0 +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include #include #include @@ -55,19 +93,18 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* * Feature macros (disable/enable) diff -Nru a/fs/xfs/linux/xfs_super.h b/fs/xfs/linux/xfs_super.h --- a/fs/xfs/linux/xfs_super.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/linux/xfs_super.h Thu Jan 29 23:15:58 2004 @@ -60,6 +60,14 @@ # define set_posix_acl_flag(sb) do { } while (0) #endif +#ifdef CONFIG_XFS_SECURITY +# define XFS_SECURITY_STRING "security attrs, " +# define ENOSECURITY 0 +#else +# define XFS_SECURITY_STRING +# define ENOSECURITY EOPNOTSUPP +#endif + #ifdef CONFIG_XFS_RT # define XFS_REALTIME_STRING "realtime, " #else @@ -89,6 +97,7 @@ #endif #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_BIGFS_STRING \ XFS_TRACE_STRING \ diff -Nru a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c --- a/fs/xfs/pagebuf/page_buf.c Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,2107 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ - -/* - * page_buf.c - * - * The page_buf module provides an abstract buffer cache model on top of - * the Linux page cache. Cached metadata blocks for a file system are - * hashed to the inode for the block device. The page_buf module - * assembles buffer (page_buf_t) objects on demand to aggregate such - * cached pages for I/O. - * - * - * Written by Steve Lord, Jim Mostek, Russell Cattelan - * and Rajagopal Ananthanarayanan ("ananth") at SGI. 
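The XFS_BIG_BLKNOS / XFS_BIG_INUMS block in the xfs_linux.h hunk above picks address-width capabilities at compile time from the kernel configuration and the word size. The same pattern in isolation, with CONFIG_LBD standing in for the real config symbol and GCC's __SIZEOF_LONG__ replacing BITS_PER_LONG:

	#include <stdio.h>

	#if defined(CONFIG_LBD) || (__SIZEOF_LONG__ == 8)
	# define BIG_BLKNOS	1
	#else
	# define BIG_BLKNOS	0
	#endif

	int main(void)
	{
		printf("64-bit block numbers: %s\n", BIG_BLKNOS ? "yes" : "no");
		return 0;
	}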
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "page_buf.h" - -#define BBSHIFT 9 -#define BN_ALIGN_MASK ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1) - -#ifndef GFP_READAHEAD -#define GFP_READAHEAD (__GFP_NOWARN|__GFP_NORETRY) -#endif - -/* - * File wide globals - */ - -STATIC kmem_cache_t *pagebuf_cache; -STATIC void pagebuf_daemon_wakeup(int); -STATIC void pagebuf_delwri_queue(page_buf_t *, int); -STATIC struct workqueue_struct *pagebuf_logio_workqueue; -STATIC struct workqueue_struct *pagebuf_dataio_workqueue; - -/* - * Pagebuf module configuration parameters, exported via - * /proc/sys/vm/pagebuf - */ - -typedef struct pb_sysctl_val { - int min; - int val; - int max; -} pb_sysctl_val_t; - -struct { - pb_sysctl_val_t flush_interval; /* interval between runs of the - * delwri flush daemon. */ - pb_sysctl_val_t age_buffer; /* time for buffer to age before - * we flush it. */ - pb_sysctl_val_t stats_clear; /* clear the pagebuf stats */ - pb_sysctl_val_t debug; /* debug tracing on or off */ -} pb_params = { - /* MIN DFLT MAX */ - .flush_interval = { HZ/2, HZ, 30*HZ }, - .age_buffer = { 1*HZ, 15*HZ, 300*HZ }, - .stats_clear = { 0, 0, 1 }, - .debug = { 0, 0, 1 }, -}; - -enum { - PB_FLUSH_INT = 1, - PB_FLUSH_AGE = 2, - PB_STATS_CLEAR = 3, - PB_DEBUG = 4, -}; - -/* - * Pagebuf statistics variables - */ - -struct pbstats { - u_int32_t pb_get; - u_int32_t pb_create; - u_int32_t pb_get_locked; - u_int32_t pb_get_locked_waited; - u_int32_t pb_busy_locked; - u_int32_t pb_miss_locked; - u_int32_t pb_page_retries; - u_int32_t pb_page_found; - u_int32_t pb_get_read; -} pbstats; -DEFINE_PER_CPU(struct pbstats, pbstats); - -/* We don't disable preempt, not too worried about poking the - * wrong cpu's stat for now */ -#define PB_STATS_INC(count) (__get_cpu_var(pbstats).count++) - -/* - * Pagebuf debugging - */ - -#ifdef PAGEBUF_TRACE -void -pagebuf_trace( - page_buf_t *pb, - char *id, - void *data, - void *ra) -{ - if (!pb_params.debug.val) - return; - ktrace_enter(pagebuf_trace_buf, - pb, id, - (void *)(unsigned long)pb->pb_flags, - (void *)(unsigned long)pb->pb_hold.counter, - (void *)(unsigned long)pb->pb_sema.count.counter, - (void *)current, - data, ra, - (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), - (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), - (void *)(unsigned long)pb->pb_buffer_length, - NULL, NULL, NULL, NULL, NULL); -} -ktrace_t *pagebuf_trace_buf; -EXPORT_SYMBOL(pagebuf_trace_buf); -#define PAGEBUF_TRACE_SIZE 4096 -#define PB_TRACE(pb, id, data) \ - pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) -#else -#define PB_TRACE(pb, id, data) do { } while (0) -#endif - -#ifdef PAGEBUF_LOCK_TRACKING -# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) -# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) -# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) -#else -# define PB_SET_OWNER(pb) do { } while (0) -# define PB_CLEAR_OWNER(pb) do { } while (0) -# define PB_GET_OWNER(pb) do { } while (0) -#endif - -/* - * Pagebuf allocation / freeing. - */ - -#define pb_to_gfp(flags) \ - (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \ - ((flags) & PBF_DONT_BLOCK) ? 
GFP_NOFS : GFP_KERNEL) - -#define pagebuf_allocate(flags) \ - kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags)) -#define pagebuf_deallocate(pb) \ - kmem_cache_free(pagebuf_cache, (pb)); - -/* - * Pagebuf hashing - */ - -#define NBITS 8 -#define NHASH (1<<NBITS) - -typedef struct { - struct list_head pb_hash; - int pb_count; - spinlock_t pb_hash_lock; -} pb_hash_t; - -STATIC pb_hash_t pbhash[NHASH]; -#define pb_hash(pb) &pbhash[pb->pb_hash_index] - -STATIC int -_bhash( - struct block_device *bdev, - loff_t base) -{ - int bit, hval; - - base >>= 9; - base ^= (unsigned long)bdev / L1_CACHE_BYTES; - for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) { - hval ^= (int)base & (NHASH-1); - base >>= NBITS; - } - return hval; -} - -/* - * Mapping of multi-page buffers into contiguous virtual space - */ - -STATIC void *pagebuf_mapout_locked(page_buf_t *); - -typedef struct a_list { - void *vm_addr; - struct a_list *next; -} a_list_t; - -STATIC a_list_t *as_free_head; -STATIC int as_list_len; -STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED; - -/* - * Try to batch vunmaps because they are costly. - */ -STATIC void -free_address( - void *addr) -{ - a_list_t *aentry; - - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC); - if (aentry) { - spin_lock(&as_lock); - aentry->next = as_free_head; - aentry->vm_addr = addr; - as_free_head = aentry; - as_list_len++; - spin_unlock(&as_lock); - } else { - vunmap(addr); - } -} - -STATIC void -purge_addresses(void) -{ - a_list_t *aentry, *old; - - if (as_free_head == NULL) - return; - - spin_lock(&as_lock); - aentry = as_free_head; - as_free_head = NULL; - as_list_len = 0; - spin_unlock(&as_lock); - - while ((old = aentry) != NULL) { - vunmap(aentry->vm_addr); - aentry = aentry->next; - kfree(old); - } -} - -/* - * Internal pagebuf object manipulation - */ - -STATIC void -_pagebuf_initialize( - page_buf_t *pb, - pb_target_t *target, - loff_t range_base, - size_t range_length, - page_buf_flags_t flags) -{ - /* - * We don't want certain flags to appear in pb->pb_flags. - */ - flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); - - memset(pb, 0, sizeof(page_buf_t)); - atomic_set(&pb->pb_hold, 1); - init_MUTEX_LOCKED(&pb->pb_iodonesema); - INIT_LIST_HEAD(&pb->pb_list); - INIT_LIST_HEAD(&pb->pb_hash_list); - init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ - PB_SET_OWNER(pb); - pb->pb_target = target; - pb->pb_file_offset = range_base; - /* - * Set buffer_length and count_desired to the same value initially. - * IO routines should use count_desired, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ - pb->pb_buffer_length = pb->pb_count_desired = range_length; - pb->pb_flags = flags | PBF_NONE; - pb->pb_bn = PAGE_BUF_DADDR_NULL; - atomic_set(&pb->pb_pin_count, 0); - init_waitqueue_head(&pb->pb_waiters); - - PB_STATS_INC(pb_create); - PB_TRACE(pb, "initialize", target); -} - -/* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. - */ -STATIC int -_pagebuf_get_pages( - page_buf_t *pb, - int page_count, - page_buf_flags_t flags) -{ - int gpf_mask = pb_to_gfp(flags); - - /* Make sure that we have a page list */ - if (pb->pb_pages == NULL) { - pb->pb_offset = page_buf_poff(pb->pb_file_offset); - pb->pb_page_count = page_count; - if (page_count <= PB_PAGES) { - pb->pb_pages = pb->pb_page_array; - } else { - pb->pb_pages = kmalloc(sizeof(struct page *) * - page_count, gpf_mask); - if (pb->pb_pages == NULL) - return -ENOMEM; - } - memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); - } - return 0; -} -
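_bhash above folds a sector-aligned offset, xor'd with the block_device pointer, into one of NHASH buckets, NBITS at a time. The same fold as a standalone function, with a plain integer standing in for the device pointer and L1_CACHE_BYTES fixed at 64:

	#include <stdio.h>

	#define NBITS	8
	#define NHASH	(1 << NBITS)

	static int bhash(unsigned long dev, long long base)
	{
		int bit, hval;

		base >>= 9;		/* bytes to 512-byte basic blocks */
		base ^= dev / 64;	/* stand-in for L1_CACHE_BYTES */
		for (bit = hval = 0; base && bit < (int)(sizeof(base) * 8); bit += NBITS) {
			hval ^= (int)base & (NHASH - 1);
			base >>= NBITS;
		}
		return hval;
	}

	int main(void)
	{
		printf("bucket %d\n", bhash(0xc0ffee00UL, 123456789LL));
		return 0;
	}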
-/* - * Walk a pagebuf releasing all the pages contained within it. - */ -STATIC inline void -_pagebuf_freepages( - page_buf_t *pb) -{ - int buf_index; - - for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) { - struct page *page = pb->pb_pages[buf_index]; - - if (page) { - pb->pb_pages[buf_index] = NULL; - page_cache_release(page); - } - } -} - -/* - * _pagebuf_free_object - * - * _pagebuf_free_object releases the contents of the specified buffer. - * The modification state of any associated pages is left unchanged. - */ -void -_pagebuf_free_object( - pb_hash_t *hash, /* hash bucket for buffer */ - page_buf_t *pb) /* buffer to deallocate */ -{ - page_buf_flags_t pb_flags = pb->pb_flags; - - PB_TRACE(pb, "free_object", 0); - pb->pb_flags |= PBF_FREED; - - if (hash) { - if (!list_empty(&pb->pb_hash_list)) { - hash->pb_count--; - list_del_init(&pb->pb_hash_list); - } - spin_unlock(&hash->pb_hash_lock); - } - - if (!(pb_flags & PBF_FREED)) { - /* release any virtual mapping */ ; - if (pb->pb_flags & _PBF_ADDR_ALLOCATED) { - void *vaddr = pagebuf_mapout_locked(pb); - if (vaddr) { - free_address(vaddr); - } - } - - if (pb->pb_flags & _PBF_MEM_ALLOCATED) { - if (pb->pb_pages) { - /* release the pages in the address list */ - if ((pb->pb_pages[0]) && - (pb->pb_flags & _PBF_MEM_SLAB)) { - kfree(pb->pb_addr); - } else { - _pagebuf_freepages(pb); - } - if (pb->pb_pages != pb->pb_page_array) - kfree(pb->pb_pages); - pb->pb_pages = NULL; - } - pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB); - } - } - - pagebuf_deallocate(pb); -} - -/* - * _pagebuf_lookup_pages - * - * _pagebuf_lookup_pages finds all pages which match the buffer - * in question and the range of file offsets supplied, - * and builds the page list for the buffer, if the - * page list is not already formed or if not all of the pages are - * already in the list. Invalid pages (pages which have not yet been - * read in from disk) are assigned for any pages which are not found. - */ -STATIC int -_pagebuf_lookup_pages( - page_buf_t *pb, - struct address_space *aspace, - page_buf_flags_t flags) -{ - loff_t next_buffer_offset; - unsigned long page_count, pi, index; - struct page *page; - int gfp_mask, retry_count = 5, rval = 0; - int all_mapped, good_pages, nbytes; - unsigned int blocksize, sectorshift; - size_t size, offset; - - - /* For pagebufs where we want to map an address, do not use - * highmem pages - so that we do not need to use kmap resources - * to access the data. - * - * For pages where the caller has indicated there may be resource - * contention (e.g. called from a transaction) do not flush - * delalloc pages to obtain memory.
- */ - - if (flags & PBF_READ_AHEAD) { - gfp_mask = GFP_READAHEAD; - retry_count = 0; - } else if (flags & PBF_DONT_BLOCK) { - gfp_mask = GFP_NOFS; - } else if (flags & PBF_MAPPABLE) { - gfp_mask = GFP_KERNEL; - } else { - gfp_mask = GFP_HIGHUSER; - } - - next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length; - - good_pages = page_count = (page_buf_btoc(next_buffer_offset) - - page_buf_btoct(pb->pb_file_offset)); - - if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) { - /* Bring pages forward in cache */ - for (pi = 0; pi < page_count; pi++) { - mark_page_accessed(pb->pb_pages[pi]); - } - if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) { - all_mapped = 1; - goto mapit; - } - return 0; - } - - /* Ensure pb_pages field has been initialised */ - rval = _pagebuf_get_pages(pb, page_count, flags); - if (rval) - return rval; - - rval = pi = 0; - blocksize = pb->pb_target->pbr_bsize; - sectorshift = pb->pb_target->pbr_sshift; - size = pb->pb_count_desired; - offset = pb->pb_offset; - - /* Enter the pages in the page list */ - index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT; - for (all_mapped = 1; pi < page_count; pi++, index++) { - if (pb->pb_pages[pi] == 0) { - retry: - page = find_or_create_page(aspace, index, gfp_mask); - if (!page) { - if (--retry_count > 0) { - PB_STATS_INC(pb_page_retries); - pagebuf_daemon_wakeup(1); - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(10); - goto retry; - } - rval = -ENOMEM; - all_mapped = 0; - continue; - } - PB_STATS_INC(pb_page_found); - mark_page_accessed(page); - pb->pb_pages[pi] = page; - } else { - page = pb->pb_pages[pi]; - lock_page(page); - } - - nbytes = PAGE_CACHE_SIZE - offset; - if (nbytes > size) - nbytes = size; - size -= nbytes; - - if (!PageUptodate(page)) { - if (blocksize == PAGE_CACHE_SIZE) { - if (flags & PBF_READ) - pb->pb_locked = 1; - good_pages--; - } else if (!PagePrivate(page)) { - unsigned long i, range; - - /* - * In this case page->private holds a bitmap - * of uptodate sectors within the page - */ - ASSERT(blocksize < PAGE_CACHE_SIZE); - range = (offset + nbytes) >> sectorshift; - for (i = offset >> sectorshift; i < range; i++) - if (!test_bit(i, &page->private)) - break; - if (i != range) - good_pages--; - } else { - good_pages--; - } - } - offset = 0; - } - - if (!pb->pb_locked) { - for (pi = 0; pi < page_count; pi++) { - if (pb->pb_pages[pi]) - unlock_page(pb->pb_pages[pi]); - } - } - -mapit: - pb->pb_flags |= _PBF_MEM_ALLOCATED; - if (all_mapped) { - pb->pb_flags |= _PBF_ALL_PAGES_MAPPED; - - /* A single page buffer is always mappable */ - if (page_count == 1) { - pb->pb_addr = (caddr_t) - page_address(pb->pb_pages[0]) + pb->pb_offset; - pb->pb_flags |= PBF_MAPPED; - } else if (flags & PBF_MAPPED) { - if (as_list_len > 64) - purge_addresses(); - pb->pb_addr = vmap(pb->pb_pages, page_count, - VM_MAP, PAGE_KERNEL); - if (pb->pb_addr == NULL) - return -ENOMEM; - pb->pb_addr += pb->pb_offset; - pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED; - } - } - /* If some pages were found with data in them - * we are not in PBF_NONE state. - */ - if (good_pages != 0) { - pb->pb_flags &= ~(PBF_NONE); - if (good_pages != page_count) { - pb->pb_flags |= PBF_PARTIAL; - } - } - - PB_TRACE(pb, "lookup_pages", (long)good_pages); - - return rval; -} - -/* - * Finding and Reading Buffers - */ - -/* - * _pagebuf_find - * - * Looks up, and creates if absent, a lockable buffer for - * a given range of an inode. The buffer is returned - * locked. 
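When the block size is smaller than the page size, the lookup loop above uses page->private as a bitmap of valid sectors instead of the all-or-nothing PageUptodate bit. The range test it performs, reduced to a standalone check with a 512-byte sector assumed:

	#include <stdio.h>

	#define SECTOR_SHIFT	9

	/* 1 if every sector covering [off, off+len) is marked valid */
	static int range_uptodate(unsigned long bits, unsigned off, unsigned len)
	{
		unsigned i, first = off >> SECTOR_SHIFT;
		unsigned last = (off + len) >> SECTOR_SHIFT;

		for (i = first; i < last; i++)
			if (!(bits & (1UL << i)))
				return 0;
		return 1;
	}

	int main(void)
	{
		unsigned long bits = 0x0f;	/* first four sectors valid */

		printf("%d\n", range_uptodate(bits, 0, 2048));		/* 1 */
		printf("%d\n", range_uptodate(bits, 2048, 2048));	/* 0 */
		return 0;
	}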
If other overlapping buffers exist, they are - * released before the new buffer is created and locked, - * which may imply that this call will block until those buffers - * are unlocked. No I/O is implied by this call. - */ -STATIC page_buf_t * -_pagebuf_find( /* find buffer for block */ - pb_target_t *target,/* target for block */ - loff_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - page_buf_flags_t flags, /* PBF_TRYLOCK */ - page_buf_t *new_pb)/* newly allocated buffer */ -{ - loff_t range_base; - size_t range_length; - int hval; - pb_hash_t *h; - struct list_head *p; - page_buf_t *pb; - int not_locked; - - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); - - /* Ensure we never do IOs smaller than the sector size */ - BUG_ON(range_length < (1 << target->pbr_sshift)); - - /* Ensure we never do IOs that are not sector aligned */ - BUG_ON(range_base & (loff_t)target->pbr_smask); - - hval = _bhash(target->pbr_bdev, range_base); - h = &pbhash[hval]; - - spin_lock(&h->pb_hash_lock); - list_for_each(p, &h->pb_hash) { - pb = list_entry(p, page_buf_t, pb_hash_list); - - if ((target == pb->pb_target) && - (pb->pb_file_offset == range_base) && - (pb->pb_buffer_length == range_length)) { - if (pb->pb_flags & PBF_FREED) - break; - /* If we look at something bring it to the - * front of the list for next time - */ - list_del(&pb->pb_hash_list); - list_add(&pb->pb_hash_list, &h->pb_hash); - goto found; - } - } - - /* No match found */ - if (new_pb) { - _pagebuf_initialize(new_pb, target, range_base, - range_length, flags | _PBF_LOCKABLE); - new_pb->pb_hash_index = hval; - h->pb_count++; - list_add(&new_pb->pb_hash_list, &h->pb_hash); - } else { - PB_STATS_INC(pb_miss_locked); - } - - spin_unlock(&h->pb_hash_lock); - return (new_pb); - -found: - atomic_inc(&pb->pb_hold); - spin_unlock(&h->pb_hash_lock); - - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - not_locked = down_trylock(&pb->pb_sema); - if (not_locked) { - if (!(flags & PBF_TRYLOCK)) { - /* wait for buffer ownership */ - PB_TRACE(pb, "get_lock", 0); - pagebuf_lock(pb); - PB_STATS_INC(pb_get_locked_waited); - } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this pagebuf at least overlaps our - * pagebuf and is locked, therefore our buffer - * either does not exist, or is this buffer - */ - - pagebuf_rele(pb); - PB_STATS_INC(pb_busy_locked); - return (NULL); - } - } else { - /* trylock worked */ - PB_SET_OWNER(pb); - } - - if (pb->pb_flags & PBF_STALE) - pb->pb_flags &= PBF_MAPPABLE | \ - PBF_MAPPED | \ - _PBF_LOCKABLE | \ - _PBF_ALL_PAGES_MAPPED | \ - _PBF_ADDR_ALLOCATED | \ - _PBF_MEM_ALLOCATED | \ - _PBF_MEM_SLAB; - PB_TRACE(pb, "got_lock", 0); - PB_STATS_INC(pb_get_locked); - return (pb); -} - - -/* - * pagebuf_find - * - * pagebuf_find returns a buffer matching the specified range of - * data for the specified target, if any of the relevant blocks - * are in memory. The buffer may have unallocated holes, if - * some, but not all, of the blocks are in memory. Even where - * pages are present in the buffer, not all of every page may be - * valid. 
- */ -page_buf_t * -pagebuf_find( /* find buffer for block */ - /* if the block is in memory */ - pb_target_t *target,/* target for block */ - loff_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - page_buf_flags_t flags) /* PBF_TRYLOCK */ -{ - return _pagebuf_find(target, ioff, isize, flags, NULL); -} - -/* - * pagebuf_get - * - * pagebuf_get assembles a buffer covering the specified range. - * Some or all of the blocks in the range may be valid. Storage - * in memory for all portions of the buffer will be allocated, - * although backing storage may not be. If PBF_READ is set in - * flags, pagebuf_iostart is called also. - */ -page_buf_t * -pagebuf_get( /* allocate a buffer */ - pb_target_t *target,/* target for buffer */ - loff_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - page_buf_flags_t flags) /* PBF_TRYLOCK */ -{ - page_buf_t *pb, *new_pb; - int error; - - new_pb = pagebuf_allocate(flags); - if (unlikely(!new_pb)) - return (NULL); - - pb = _pagebuf_find(target, ioff, isize, flags, new_pb); - if (pb != new_pb) { - pagebuf_deallocate(new_pb); - if (unlikely(!pb)) - return (NULL); - } - - PB_STATS_INC(pb_get); - - /* fill in any missing pages */ - error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags); - if (unlikely(error)) { - pagebuf_free(pb); - return (NULL); - } - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later. - */ - pb->pb_bn = ioff; - pb->pb_count_desired = pb->pb_buffer_length; - - if (flags & PBF_READ) { - if (PBF_NOT_DONE(pb)) { - PB_TRACE(pb, "get_read", (unsigned long)flags); - PB_STATS_INC(pb_get_read); - pagebuf_iostart(pb, flags); - } else if (flags & PBF_ASYNC) { - PB_TRACE(pb, "get_read_async", (unsigned long)flags); - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - if (flags & (PBF_LOCK | PBF_TRYLOCK)) - pagebuf_unlock(pb); - pagebuf_rele(pb); - return NULL; - } else { - PB_TRACE(pb, "get_read_done", (unsigned long)flags); - /* We do not want read in the flags */ - pb->pb_flags &= ~PBF_READ; - } - } else { - PB_TRACE(pb, "get_write", (unsigned long)flags); - } - return (pb); -} - -/* - * Create a skeletal pagebuf (no pages associated with it). - */ -page_buf_t * -pagebuf_lookup( - struct pb_target *target, - loff_t ioff, - size_t isize, - page_buf_flags_t flags) -{ - page_buf_t *pb; - - pb = pagebuf_allocate(flags); - if (pb) { - _pagebuf_initialize(pb, target, ioff, isize, flags); - } - return pb; -} - -/* - * If we are not low on memory then do the readahead in a deadlock - * safe manner. 
- */ -void -pagebuf_readahead( - pb_target_t *target, - loff_t ioff, - size_t isize, - page_buf_flags_t flags) -{ - struct backing_dev_info *bdi; - - bdi = target->pbr_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) - return; - if (bdi_write_congested(bdi)) - return; - - flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD); - pagebuf_get(target, ioff, isize, flags); -} - -page_buf_t * -pagebuf_get_empty( - size_t len, - pb_target_t *target) -{ - page_buf_t *pb; - - pb = pagebuf_allocate(_PBF_LOCKABLE); - if (pb) - _pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE); - return pb; -} - -static inline struct page * -mem_to_page( - void *addr) -{ - if (((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END)) { - return virt_to_page(addr); - } else { - return vmalloc_to_page(addr); - } -} - -int -pagebuf_associate_memory( - page_buf_t *pb, - void *mem, - size_t len) -{ - int rval; - int i = 0; - size_t ptr; - size_t end, end_cur; - off_t offset; - int page_count; - - page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; - offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); - if (offset && (len > PAGE_CACHE_SIZE)) - page_count++; - - /* Free any previous set of page pointers */ - if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) { - kfree(pb->pb_pages); - } - pb->pb_pages = NULL; - pb->pb_addr = mem; - - rval = _pagebuf_get_pages(pb, page_count, 0); - if (rval) - return rval; - - pb->pb_offset = offset; - ptr = (size_t) mem & PAGE_CACHE_MASK; - end = PAGE_CACHE_ALIGN((size_t) mem + len); - end_cur = end; - /* set up first page */ - pb->pb_pages[0] = mem_to_page(mem); - - ptr += PAGE_CACHE_SIZE; - pb->pb_page_count = ++i; - while (ptr < end) { - pb->pb_pages[i] = mem_to_page((void *)ptr); - pb->pb_page_count = ++i; - ptr += PAGE_CACHE_SIZE; - } - pb->pb_locked = 0; - - pb->pb_count_desired = pb->pb_buffer_length = len; - pb->pb_flags |= PBF_MAPPED; - - return 0; -} - -page_buf_t * -pagebuf_get_no_daddr( - size_t len, - pb_target_t *target) -{ - int rval; - void *rmem = NULL; - page_buf_flags_t flags = _PBF_LOCKABLE | PBF_FORCEIO; - page_buf_t *pb; - size_t tlen = 0; - - if (unlikely(len > 0x20000)) - return NULL; - - pb = pagebuf_allocate(flags); - if (!pb) - return NULL; - - _pagebuf_initialize(pb, target, 0, len, flags); - - do { - if (tlen == 0) { - tlen = len; /* first time */ - } else { - kfree(rmem); /* free the mem from the previous try */ - tlen <<= 1; /* double the size and try again */ - } - if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) { - pagebuf_free(pb); - return NULL; - } - } while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask)); - - if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) { - kfree(rmem); - pagebuf_free(pb); - return NULL; - } - /* otherwise pagebuf_free just ignores it */ - pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB); - PB_CLEAR_OWNER(pb); - up(&pb->pb_sema); /* Return unlocked pagebuf */ - - PB_TRACE(pb, "no_daddr", rmem); - - return pb; -} - - -/* - * pagebuf_hold - * - * Increment reference count on buffer, to hold the buffer concurrently - * with another thread which may release (free) the buffer asynchronously. - * - * Must hold the buffer already to call this function. - */ -void -pagebuf_hold( - page_buf_t *pb) -{ - atomic_inc(&pb->pb_hold); - PB_TRACE(pb, "hold", 0); -} - -/* - * pagebuf_free - * - * pagebuf_free releases the specified buffer. The modification - * state of any associated pages is left unchanged. 
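pagebuf_get_no_daddr above cannot ask kmalloc for a specific alignment, so it doubles the request size until the allocator happens to return a pointer that satisfies the target's sector mask. The same trick in userspace terms, with a retry cap added for safety (the kernel loop has none); posix_memalign would be the modern equivalent:

	#include <stdio.h>
	#include <stdlib.h>

	static void *alloc_aligned(size_t len, size_t align, size_t *got)
	{
		void *mem = NULL;
		size_t tlen = len;
		int tries = 20;

		while (tries--) {
			free(mem);	/* discard the previous attempt */
			mem = malloc(tlen);
			if (mem == NULL)
				return NULL;
			if (((size_t)mem & (align - 1)) == 0) {
				*got = tlen;
				return mem;	/* landed on an aligned address */
			}
			tlen <<= 1;	/* double the size and try again */
		}
		free(mem);
		return NULL;
	}

	int main(void)
	{
		size_t got = 0;
		void *p = alloc_aligned(600, 64, &got);

		if (p)
			printf("%p, %zu bytes\n", p, got);
		free(p);
		return 0;
	}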
- */ -void -pagebuf_free( - page_buf_t *pb) -{ - if (pb->pb_flags & _PBF_LOCKABLE) { - pb_hash_t *h = pb_hash(pb); - - spin_lock(&h->pb_hash_lock); - _pagebuf_free_object(h, pb); - } else { - _pagebuf_free_object(NULL, pb); - } -} - -/* - * pagebuf_rele - * - * pagebuf_rele releases a hold on the specified buffer. If the - * hold count is 1, pagebuf_rele calls pagebuf_free. - */ -void -pagebuf_rele( - page_buf_t *pb) -{ - pb_hash_t *h; - - PB_TRACE(pb, "rele", pb->pb_relse); - if (pb->pb_flags & _PBF_LOCKABLE) { - h = pb_hash(pb); - spin_lock(&h->pb_hash_lock); - } else { - h = NULL; - } - - if (atomic_dec_and_test(&pb->pb_hold)) { - int do_free = 1; - - if (pb->pb_relse) { - atomic_inc(&pb->pb_hold); - if (h) - spin_unlock(&h->pb_hash_lock); - (*(pb->pb_relse)) (pb); - do_free = 0; - } - if (pb->pb_flags & PBF_DELWRI) { - pb->pb_flags |= PBF_ASYNC; - atomic_inc(&pb->pb_hold); - if (h && do_free) - spin_unlock(&h->pb_hash_lock); - pagebuf_delwri_queue(pb, 0); - do_free = 0; - } else if (pb->pb_flags & PBF_FS_MANAGED) { - if (h) - spin_unlock(&h->pb_hash_lock); - do_free = 0; - } - - if (do_free) { - _pagebuf_free_object(h, pb); - } - } else if (h) { - spin_unlock(&h->pb_hash_lock); - } -} - - -/* - * Mutual exclusion on buffers. Locking model: - * - * Buffers associated with inodes for which buffer locking - * is not enabled are not protected by semaphores, and are - * assumed to be exclusively owned by the caller. There is a - * spinlock in the buffer, used by the caller when concurrent - * access is possible. - */ - -/* - * pagebuf_cond_lock - * - * pagebuf_cond_lock locks a buffer object, if it is not already locked. - * Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. - */ -int -pagebuf_cond_lock( /* lock buffer, if not locked */ - /* returns -EBUSY if locked */ - page_buf_t *pb) -{ - int locked; - - ASSERT(pb->pb_flags & _PBF_LOCKABLE); - locked = down_trylock(&pb->pb_sema) == 0; - if (locked) { - PB_SET_OWNER(pb); - } - PB_TRACE(pb, "cond_lock", (long)locked); - return(locked ? 0 : -EBUSY); -} - -/* - * pagebuf_lock_value - * - * Return lock value for a pagebuf - */ -int -pagebuf_lock_value( - page_buf_t *pb) -{ - ASSERT(pb->pb_flags & _PBF_LOCKABLE); - return(atomic_read(&pb->pb_sema.count)); -} - -/* - * pagebuf_lock - * - * pagebuf_lock locks a buffer object. Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. - */ -int -pagebuf_lock( - page_buf_t *pb) -{ - ASSERT(pb->pb_flags & _PBF_LOCKABLE); - - PB_TRACE(pb, "lock", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); - down(&pb->pb_sema); - PB_SET_OWNER(pb); - PB_TRACE(pb, "locked", 0); - return 0; -} - -/* - * pagebuf_unlock - * - * pagebuf_unlock releases the lock on the buffer object created by - * pagebuf_lock or pagebuf_cond_lock (not any - * pinning of underlying pages created by pagebuf_pin). - */ -void -pagebuf_unlock( /* unlock buffer */ - page_buf_t *pb) /* buffer to unlock */ -{ - ASSERT(pb->pb_flags & _PBF_LOCKABLE); - PB_CLEAR_OWNER(pb); - up(&pb->pb_sema); - PB_TRACE(pb, "unlock", 0); -} - - -/* - * Pinning Buffer Storage in Memory - */ - -/* - * pagebuf_pin - * - * pagebuf_pin locks all of the memory represented by a buffer in - * memory.
Multiple calls to pagebuf_pin and pagebuf_unpin, for - * the same or different buffers affecting a given page, will - * properly count the number of outstanding "pin" requests. The - * buffer may be released after the pagebuf_pin and a different - * buffer used when calling pagebuf_unpin, if desired. - * pagebuf_pin should be used by the file system when it wants to be - * assured that no attempt will be made to force the affected - * memory to disk. It does not assure that a given logical page - * will not be moved to a different physical page. - */ -void -pagebuf_pin( - page_buf_t *pb) -{ - atomic_inc(&pb->pb_pin_count); - PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); -} - -/* - * pagebuf_unpin - * - * pagebuf_unpin reverses the locking of memory performed by - * pagebuf_pin. Note that both functions affect the logical - * pages associated with the buffer, not the buffer itself. - */ -void -pagebuf_unpin( - page_buf_t *pb) -{ - if (atomic_dec_and_test(&pb->pb_pin_count)) { - wake_up_all(&pb->pb_waiters); - } - PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter); -} - -int -pagebuf_ispin( - page_buf_t *pb) -{ - return atomic_read(&pb->pb_pin_count); -} - -/* - * pagebuf_wait_unpin - * - * pagebuf_wait_unpin waits until all of the memory associated - * with the buffer is no longer locked in memory. It returns - * immediately if none of the affected pages are locked. - */ -static inline void -_pagebuf_wait_unpin( - page_buf_t *pb) -{ - DECLARE_WAITQUEUE (wait, current); - - if (atomic_read(&pb->pb_pin_count) == 0) - return; - - add_wait_queue(&pb->pb_waiters, &wait); - for (;;) { - current->state = TASK_UNINTERRUPTIBLE; - if (atomic_read(&pb->pb_pin_count) == 0) - break; - if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); - schedule(); - } - remove_wait_queue(&pb->pb_waiters, &wait); - current->state = TASK_RUNNING; -} - -/* - * Buffer Utility Routines - */ - -/* - * pagebuf_iodone - * - * pagebuf_iodone marks a buffer for which I/O is in progress - * done with respect to that I/O. The pb_iodone routine, if - * present, will be called as a side-effect. - */ -void -pagebuf_iodone_work( - void *v) -{ - page_buf_t *pb = (page_buf_t *)v; - - if (pb->pb_iodone) { - (*(pb->pb_iodone)) (pb); - return; - } - - if (pb->pb_flags & PBF_ASYNC) { - if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse) - pagebuf_unlock(pb); - pagebuf_rele(pb); - } -} - -void -pagebuf_iodone( - page_buf_t *pb, - int dataio, - int schedule) -{ - pb->pb_flags &= ~(PBF_READ | PBF_WRITE); - if (pb->pb_error == 0) { - pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE); - } - - PB_TRACE(pb, "iodone", pb->pb_iodone); - - if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { - if (schedule) { - INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); - queue_work(dataio ? pagebuf_dataio_workqueue : - pagebuf_logio_workqueue, &pb->pb_iodone_work); - } else { - pagebuf_iodone_work(pb); - } - } else { - up(&pb->pb_iodonesema); - } -} - -/* - * pagebuf_ioerror - * - * pagebuf_ioerror sets the error code for a buffer. - */ -void -pagebuf_ioerror( /* mark/clear buffer error flag */ - page_buf_t *pb, /* buffer to mark */ - unsigned int error) /* error to store (0 if none) */ -{ - pb->pb_error = error; - PB_TRACE(pb, "ioerror", (unsigned long)error); -} - -/* - * pagebuf_iostart - * - * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. - * If necessary, it will arrange for any disk space allocation required, - * and it will break up the request if the block mappings require it.
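The pin/unpin pair above is plain reference counting: only the transition to zero matters, because that is when _pagebuf_wait_unpin's sleepers may proceed. The counting rule in C11 atomics, with the waitqueue elided:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int pin_count;

	static void pin(void)
	{
		atomic_fetch_add(&pin_count, 1);
	}

	/* returns 1 when the last pin is dropped, i.e. when waiters should wake */
	static int unpin(void)
	{
		return atomic_fetch_sub(&pin_count, 1) == 1;
	}

	int main(void)
	{
		pin();
		pin();
		printf("%d\n", unpin());	/* 0: a pin remains */
		printf("%d\n", unpin());	/* 1: last pin gone, wake waiters */
		return 0;
	}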
- * The pb_iodone routine in the buffer supplied will only be called - * when all of the subsidiary I/O requests, if any, have been completed. - * pagebuf_iostart calls the pagebuf_ioinitiate routine or - * pagebuf_iorequest, if the former routine is not defined, to start - * the I/O on a given low-level request. - */ -int -pagebuf_iostart( /* start I/O on a buffer */ - page_buf_t *pb, /* buffer to start */ - page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ - /* PBF_WRITE, PBF_DELWRI, */ - /* PBF_SYNC, PBF_DONT_BLOCK */ -{ - int status = 0; - - PB_TRACE(pb, "iostart", (unsigned long)flags); - - if (flags & PBF_DELWRI) { - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); - pb->pb_flags |= flags & - (PBF_DELWRI | PBF_ASYNC | PBF_SYNC); - pagebuf_delwri_queue(pb, 1); - return status; - } - - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \ - PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES); - pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ - PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES); - - BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL); - - /* For writes allow an alternate strategy routine to precede - * the actual I/O request (which may not be issued at all in - * a shutdown situation, for example). - */ - status = (flags & PBF_WRITE) ? - pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); - - /* Wait for I/O if we are not an async request. - * Note: async I/O request completion will release the buffer, - * and that can already be done by this point. So using the - * buffer pointer from here on, after async I/O, is invalid. - */ - if (!status && !(flags & PBF_ASYNC)) - status = pagebuf_iowait(pb); - - return status; -} - -/* - * Helper routine for pagebuf_iorequest - */ - -STATIC __inline__ int -_pagebuf_iolocked( - page_buf_t *pb) -{ - ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); - if (pb->pb_flags & PBF_READ) - return pb->pb_locked; - return ((pb->pb_flags & _PBF_LOCKABLE) == 0); -} - -STATIC __inline__ void -_pagebuf_iodone( - page_buf_t *pb, - int schedule) -{ - if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { - pb->pb_locked = 0; - pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule); - } -} - -STATIC int -bio_end_io_pagebuf( - struct bio *bio, - unsigned int bytes_done, - int error) -{ - page_buf_t *pb = (page_buf_t *)bio->bi_private; - unsigned int i, blocksize = pb->pb_target->pbr_bsize; - unsigned int sectorshift = pb->pb_target->pbr_sshift; - struct bio_vec *bvec = bio->bi_io_vec; - - if (bio->bi_size) - return 1; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - pb->pb_error = EIO; - - for (i = 0; i < bio->bi_vcnt; i++, bvec++) { - struct page *page = bvec->bv_page; - - if (pb->pb_error) { - SetPageError(page); - } else if (blocksize == PAGE_CACHE_SIZE) { - SetPageUptodate(page); - } else if (!PagePrivate(page)) { - unsigned int j, range; - - ASSERT(blocksize < PAGE_CACHE_SIZE); - range = (bvec->bv_offset + bvec->bv_len) >> sectorshift; - for (j = bvec->bv_offset >> sectorshift; j < range; j++) - set_bit(j, &page->private); - if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1)) - SetPageUptodate(page); - } - - if (_pagebuf_iolocked(pb)) { - unlock_page(page); - } - } - - _pagebuf_iodone(pb, 1); - bio_put(bio); - return 0; -} - -void -_pagebuf_ioapply( - page_buf_t *pb) -{ - int i, map_i, total_nr_pages, nr_pages; - struct bio *bio; - int offset = pb->pb_offset; - int size = pb->pb_count_desired; - sector_t sector = pb->pb_bn; - unsigned int blocksize = pb->pb_target->pbr_bsize; - int locking = _pagebuf_iolocked(pb); - - total_nr_pages = 
pb->pb_page_count; - map_i = 0; - - /* Special code path for reading a sub page size pagebuf in -- - * we populate up the whole page, and hence the other metadata - * in the same page. This optimization is only valid when the - * filesystem block size and the page size are equal. - */ - if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && - (pb->pb_flags & PBF_READ) && locking && - (blocksize == PAGE_CACHE_SIZE)) { - bio = bio_alloc(GFP_NOIO, 1); - - bio->bi_bdev = pb->pb_target->pbr_bdev; - bio->bi_sector = sector - (offset >> BBSHIFT); - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; - - bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); - size = 0; - - atomic_inc(&pb->pb_io_remaining); - - goto submit_io; - } - - /* Lock down the pages which we need to for the request */ - if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { - for (i = 0; size; i++) { - int nbytes = PAGE_CACHE_SIZE - offset; - struct page *page = pb->pb_pages[i]; - - if (nbytes > size) - nbytes = size; - - lock_page(page); - - size -= nbytes; - offset = 0; - } - offset = pb->pb_offset; - size = pb->pb_count_desired; - } - -next_chunk: - atomic_inc(&pb->pb_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; - - bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = pb->pb_target->pbr_bdev; - bio->bi_sector = sector; - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; - - for (; size && nr_pages; nr_pages--, map_i++) { - int nbytes = PAGE_CACHE_SIZE - offset; - - if (nbytes > size) - nbytes = size; - - if (bio_add_page(bio, pb->pb_pages[map_i], - nbytes, offset) < nbytes) - break; - - offset = 0; - sector += nbytes >> BBSHIFT; - size -= nbytes; - total_nr_pages--; - } - -submit_io: - if (likely(bio->bi_size)) { - submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio); - if (size) - goto next_chunk; - } else { - bio_put(bio); - pagebuf_ioerror(pb, EIO); - } - - if (pb->pb_flags & PBF_RUN_QUEUES) { - pb->pb_flags &= ~PBF_RUN_QUEUES; - if (atomic_read(&pb->pb_io_remaining) > 1) - blk_run_queues(); - } -} - -/* - * pagebuf_iorequest - * - * pagebuf_iorequest is the core I/O request routine. - * It assumes that the buffer is well-formed and - * mapped and ready for physical I/O, unlike - * pagebuf_iostart() and pagebuf_iophysio(). Those - * routines call the pagebuf_ioinitiate routine to start I/O, - * if it is present, or else call pagebuf_iorequest() - * directly if the pagebuf_ioinitiate routine is not present. - * - * This function will be responsible for ensuring access to the - * pages is restricted whilst I/O is in progress - for locking - * pagebufs the pagebuf lock is the mediator, for non-locking - * pagebufs the pages will be locked. In the locking case we - * need to use the pagebuf lock as multiple meta-data buffers - * will reference the same page. - */ -int -pagebuf_iorequest( /* start real I/O */ - page_buf_t *pb) /* buffer to convey to device */ -{ - PB_TRACE(pb, "iorequest", 0); - - if (pb->pb_flags & PBF_DELWRI) { - pagebuf_delwri_queue(pb, 1); - return 0; - } - - if (pb->pb_flags & PBF_WRITE) { - _pagebuf_wait_unpin(pb); - } - - pagebuf_hold(pb); - - /* Set the count to 1 initially, this will stop an I/O - * completion callout which happens before we have started - * all the I/O from calling pagebuf_iodone too early. 
- */ - atomic_set(&pb->pb_io_remaining, 1); - _pagebuf_ioapply(pb); - _pagebuf_iodone(pb, 0); - - pagebuf_rele(pb); - return 0; -} - -/* - * pagebuf_iowait - * - * pagebuf_iowait waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. In any case, it returns - * the error code, if any, or 0 if there is no error. - */ -int -pagebuf_iowait( - page_buf_t *pb) -{ - PB_TRACE(pb, "iowait", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_queues(); - down(&pb->pb_iodonesema); - PB_TRACE(pb, "iowaited", (long)pb->pb_error); - return pb->pb_error; -} - -STATIC void * -pagebuf_mapout_locked( - page_buf_t *pb) -{ - void *old_addr = NULL; - - if (pb->pb_flags & PBF_MAPPED) { - if (pb->pb_flags & _PBF_ADDR_ALLOCATED) - old_addr = pb->pb_addr - pb->pb_offset; - pb->pb_addr = NULL; - pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED); - } - - return old_addr; /* Caller must free the address space, - * we are under a spin lock, probably - * not safe to do vfree here - */ -} - -caddr_t -pagebuf_offset( - page_buf_t *pb, - size_t offset) -{ - struct page *page; - - offset += pb->pb_offset; - - page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; - return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); -} - -/* - * pagebuf_iomove - * - * Move data into or out of a buffer. - */ -void -pagebuf_iomove( - page_buf_t *pb, /* buffer to process */ - size_t boff, /* starting buffer offset */ - size_t bsize, /* length to copy */ - caddr_t data, /* data address */ - page_buf_rw_t mode) /* read/write flag */ -{ - size_t bend, cpoff, csize; - struct page *page; - - bend = boff + bsize; - while (boff < bend) { - page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; - cpoff = page_buf_poff(boff + pb->pb_offset); - csize = min_t(size_t, - PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); - - ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); - - switch (mode) { - case PBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); - break; - case PBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); - break; - case PBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); - } - - boff += csize; - data += csize; - } -} - - -/* - * Pagebuf delayed write buffer handling - */ - -STATIC int pbd_active = 1; -STATIC LIST_HEAD(pbd_delwrite_queue); -STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED; - -STATIC void -pagebuf_delwri_queue( - page_buf_t *pb, - int unlock) -{ - PB_TRACE(pb, "delwri_q", (long)unlock); - spin_lock(&pbd_delwrite_lock); - /* If already in the queue, dequeue and place at tail */ - if (!list_empty(&pb->pb_list)) { - if (unlock) { - atomic_dec(&pb->pb_hold); - } - list_del(&pb->pb_list); - } - - list_add_tail(&pb->pb_list, &pbd_delwrite_queue); - pb->pb_flushtime = jiffies + pb_params.age_buffer.val; - spin_unlock(&pbd_delwrite_lock); - - if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) { - pagebuf_unlock(pb); - } -} - -void -pagebuf_delwri_dequeue( - page_buf_t *pb) -{ - PB_TRACE(pb, "delwri_uq", 0); - spin_lock(&pbd_delwrite_lock); - list_del_init(&pb->pb_list); - pb->pb_flags &= ~PBF_DELWRI; - spin_unlock(&pbd_delwrite_lock); -} - -STATIC void -pagebuf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); -} - -/* Defines for pagebuf daemon */ -DECLARE_WAIT_QUEUE_HEAD(pbd_waitq); -STATIC int force_flush; - -STATIC void -pagebuf_daemon_wakeup( - int flag) -{ - force_flush = flag; - if (waitqueue_active(&pbd_waitq)) { - wake_up_interruptible(&pbd_waitq); - } -} - -typedef void (*timeout_fn)(unsigned long); - 
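pagebuf_delwri_queue above stamps each buffer with a flush deadline of now plus the age_buffer tunable; the daemon below only writes a buffer back once that deadline has passed, unless a flush is being forced. The decision in isolation, with a wrap-safe comparison in the spirit of the kernel's time_before():

	#include <stdio.h>

	/* wrap-safe "a is before b", in the spirit of the kernel macro */
	static int time_before(unsigned long a, unsigned long b)
	{
		return (long)(a - b) < 0;
	}

	static int should_flush(unsigned long now, unsigned long deadline, int force)
	{
		return force || !time_before(now, deadline);
	}

	int main(void)
	{
		unsigned long now = 1000, age = 15 * 100;	/* ~15s at HZ=100 */
		unsigned long deadline = now + age;

		printf("%d\n", should_flush(now, deadline, 0));		/* 0: too young */
		printf("%d\n", should_flush(now + age, deadline, 0));	/* 1: aged out */
		printf("%d\n", should_flush(now, deadline, 1));		/* 1: forced */
		return 0;
	}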
-STATIC int -pagebuf_daemon( - void *data) -{ - int count; - page_buf_t *pb; - struct list_head *curr, *next, tmp; - struct timer_list pb_daemon_timer = - TIMER_INITIALIZER((timeout_fn)pagebuf_daemon_wakeup, 0, 0); - - /* Set up the thread */ - daemonize("pagebufd"); - - current->flags |= PF_MEMALLOC; - - INIT_LIST_HEAD(&tmp); - do { - /* swsusp */ - if (current->flags & PF_FREEZE) - refrigerator(PF_IOTHREAD); - - if (pbd_active == 1) { - mod_timer(&pb_daemon_timer, - jiffies + pb_params.flush_interval.val); - interruptible_sleep_on(&pbd_waitq); - } - - if (pbd_active == 0) { - del_timer_sync(&pb_daemon_timer); - } - - spin_lock(&pbd_delwrite_lock); - - count = 0; - list_for_each_safe(curr, next, &pbd_delwrite_queue) { - pb = list_entry(curr, page_buf_t, pb_list); - - PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); - - if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) && - (((pb->pb_flags & _PBF_LOCKABLE) == 0) || - !pagebuf_cond_lock(pb))) { - - if (!force_flush && - time_before(jiffies, pb->pb_flushtime)) { - pagebuf_unlock(pb); - break; - } - - pb->pb_flags &= ~PBF_DELWRI; - pb->pb_flags |= PBF_WRITE; - - list_del(&pb->pb_list); - list_add(&pb->pb_list, &tmp); - - count++; - } - } - - spin_unlock(&pbd_delwrite_lock); - while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, page_buf_t, pb_list); - list_del_init(&pb->pb_list); - - pagebuf_iostrategy(pb); - } - - if (as_list_len > 0) - purge_addresses(); - if (count) - blk_run_queues(); - - force_flush = 0; - } while (pbd_active == 1); - - pbd_active = -1; - wake_up_interruptible(&pbd_waitq); - - return 0; -} - -void -pagebuf_delwri_flush( - pb_target_t *target, - u_long flags, - int *pinptr) -{ - page_buf_t *pb; - struct list_head *curr, *next, tmp; - int pincount = 0; - int flush_cnt = 0; - - pagebuf_runall_queues(pagebuf_dataio_workqueue); - pagebuf_runall_queues(pagebuf_logio_workqueue); - - spin_lock(&pbd_delwrite_lock); - INIT_LIST_HEAD(&tmp); - - list_for_each_safe(curr, next, &pbd_delwrite_queue) { - pb = list_entry(curr, page_buf_t, pb_list); - - /* - * Skip other targets, markers and in progress buffers - */ - - if ((pb->pb_flags == 0) || (pb->pb_target != target) || - !(pb->pb_flags & PBF_DELWRI)) { - continue; - } - - PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); - if (pagebuf_ispin(pb)) { - pincount++; - continue; - } - - pb->pb_flags &= ~PBF_DELWRI; - pb->pb_flags |= PBF_WRITE; - list_move(&pb->pb_list, &tmp); - } - /* ok found all the items that can be worked on - * drop the lock and process the private list */ - spin_unlock(&pbd_delwrite_lock); - - list_for_each_safe(curr, next, &tmp) { - pb = list_entry(curr, page_buf_t, pb_list); - - if (flags & PBDF_WAIT) - pb->pb_flags &= ~PBF_ASYNC; - else - list_del_init(curr); - - pagebuf_lock(pb); - pagebuf_iostrategy(pb); - if (++flush_cnt > 32) { - blk_run_queues(); - flush_cnt = 0; - } - } - - blk_run_queues(); - - while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, page_buf_t, pb_list); - - list_del_init(&pb->pb_list); - pagebuf_iowait(pb); - if (!pb->pb_relse) - pagebuf_unlock(pb); - pagebuf_rele(pb); - } - - if (pinptr) - *pinptr = pincount; -} - -STATIC int -pagebuf_daemon_start(void) -{ - int rval; - - pagebuf_logio_workqueue = create_workqueue("xfslogd"); - if (!pagebuf_logio_workqueue) - return -ENOMEM; - - pagebuf_dataio_workqueue = create_workqueue("xfsdatad"); - if (!pagebuf_dataio_workqueue) { - destroy_workqueue(pagebuf_logio_workqueue); - return -ENOMEM; - } - - rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES); - if (rval < 0) { - 
destroy_workqueue(pagebuf_logio_workqueue); - destroy_workqueue(pagebuf_dataio_workqueue); - } - - return rval; -} - -/* - * pagebuf_daemon_stop - * - * Note: do not mark as __exit, it is called from pagebuf_terminate. - */ -STATIC void -pagebuf_daemon_stop(void) -{ - pbd_active = 0; - wake_up_interruptible(&pbd_waitq); - wait_event_interruptible(pbd_waitq, pbd_active); - destroy_workqueue(pagebuf_logio_workqueue); - destroy_workqueue(pagebuf_dataio_workqueue); -} - - -/* - * Pagebuf sysctl interface - */ - -STATIC int -pb_stats_clear_handler( - ctl_table *ctl, - int write, - struct file *filp, - void *buffer, - size_t *lenp) -{ - int c, ret; - int *valp = ctl->data; - - ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp); - - if (!ret && write && *valp) { - printk("XFS Clearing pbstats\n"); - for (c = 0; c < NR_CPUS; c++) { - if (!cpu_possible(c)) continue; - memset(&per_cpu(pbstats, c), 0, - sizeof(struct pbstats)); - } - pb_params.stats_clear.val = 0; - } - - return ret; -} - -STATIC struct ctl_table_header *pagebuf_table_header; - -STATIC ctl_table pagebuf_table[] = { - {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &pb_params.flush_interval.min, &pb_params.flush_interval.max}, - - {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &pb_params.age_buffer.min, &pb_params.age_buffer.max}, - - {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val, - sizeof(int), 0644, NULL, &pb_stats_clear_handler, - &sysctl_intvec, NULL, - &pb_params.stats_clear.min, &pb_params.stats_clear.max}, - -#ifdef PAGEBUF_TRACE - {PB_DEBUG, "debug", &pb_params.debug.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &pb_params.debug.min, &pb_params.debug.max}, -#endif - {0} -}; - -STATIC ctl_table pagebuf_dir_table[] = { - {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table}, - {0} -}; - -STATIC ctl_table pagebuf_root_table[] = { - {CTL_VM, "vm", NULL, 0, 0555, pagebuf_dir_table}, - {0} -}; - -#ifdef CONFIG_PROC_FS -STATIC int -pagebuf_readstats( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) -{ - int c, i, len, val; - - len = 0; - len += sprintf(buffer + len, "pagebuf"); - for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) { - val = 0; - for (c = 0 ; c < NR_CPUS; c++) { - if (!cpu_possible(c)) continue; - val += *(((u_int32_t*)&per_cpu(pbstats, c) + i)); - } - len += sprintf(buffer + len, " %u", val); - } - buffer[len++] = '\n'; - - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; -} -#endif /* CONFIG_PROC_FS */ - -/* - * Initialization and Termination - */ - -int __init -pagebuf_init(void) -{ - int i; - - pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1); - -#ifdef CONFIG_PROC_FS - if (proc_mkdir("fs/pagebuf", 0)) - create_proc_read_entry( - "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL); -#endif - - pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (pagebuf_cache == NULL) { - printk("pagebuf: couldn't init pagebuf cache\n"); - pagebuf_terminate(); - return -ENOMEM; - } - - for (i = 0; i < NHASH; i++) { - spin_lock_init(&pbhash[i].pb_hash_lock); - INIT_LIST_HEAD(&pbhash[i].pb_hash); - } - -#ifdef PAGEBUF_TRACE - pagebuf_trace_buf = 
ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); -#endif - - pagebuf_daemon_start(); - return 0; -} - - -/* - * pagebuf_terminate. - * - * Note: do not mark as __exit, this is also called from the __init code. - */ -void -pagebuf_terminate(void) -{ - pagebuf_daemon_stop(); - - kmem_cache_destroy(pagebuf_cache); - - unregister_sysctl_table(pagebuf_table_header); -#ifdef CONFIG_PROC_FS - remove_proc_entry("fs/pagebuf/stat", NULL); - remove_proc_entry("fs/pagebuf", NULL); -#endif -} - - -/* - * Module management (for kernel debugger module) - */ -EXPORT_SYMBOL(pagebuf_offset); -#ifdef DEBUG -EXPORT_SYMBOL(pbd_delwrite_queue); -#endif diff -Nru a/fs/xfs/pagebuf/page_buf.h b/fs/xfs/pagebuf/page_buf.h --- a/fs/xfs/pagebuf/page_buf.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,340 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
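pagebuf_readstats() above walks each per-CPU pbstats structure as a flat array of u_int32_t counters and sums entry i across all possible CPUs. Here is that idiom in stand-alone form; the three-field layout below is a hypothetical stand-in, since the real pbstats definition is outside this patch.

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

struct stats {                            /* hypothetical counter block */
	uint32_t pb_get;
	uint32_t pb_create;
	uint32_t pb_miss;
};

static struct stats percpu[NCPUS];        /* one copy per CPU */

int main(void)
{
	size_t nfields = sizeof(struct stats) / sizeof(uint32_t);
	size_t i, c;

	percpu[0].pb_get = 2;
	percpu[3].pb_get = 5;

	for (i = 0; i < nfields; i++) {
		uint32_t val = 0;
		for (c = 0; c < NCPUS; c++)
			val += *((uint32_t *)&percpu[c] + i);  /* field i, CPU c */
		printf(" %u", (unsigned)val);
	}
	putchar('\n');                            /* prints " 7 0 0" */
	return 0;
}

The cast-and-index trick is safe here only because the structure is built purely from u_int32_t fields, so there is no padding between counters.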
- * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ - -/* - * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI - */ - -#ifndef __PAGE_BUF_H__ -#define __PAGE_BUF_H__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Base types - */ - -/* daddr must be signed since -1 is used for bmaps that are not yet allocated */ -typedef loff_t page_buf_daddr_t; - -#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL)) - -#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - -typedef enum page_buf_rw_e { - PBRW_READ = 1, /* transfer into target memory */ - PBRW_WRITE = 2, /* transfer from target memory */ - PBRW_ZERO = 3 /* Zero target memory */ -} page_buf_rw_t; - - -typedef enum page_buf_flags_e { /* pb_flags values */ - PBF_READ = (1 << 0), /* buffer intended for reading from device */ - PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ - PBF_PARTIAL = (1 << 3), /* buffer partially read */ - PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - PBF_NONE = (1 << 5), /* buffer not read at all */ - PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - PBF_FREED = (1 << 7), /* buffer has been freed and is invalid */ - PBF_SYNC = (1 << 8), /* force updates to disk */ - PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages */ - PBF_STALE = (1 << 10), /* buffer has been staled, do not find it */ - PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory */ - PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad */ - - /* flags used only as arguments to access routines */ - PBF_LOCK = (1 << 13), /* lock requested */ - PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait */ - PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread */ - - /* flags used only internally */ - _PBF_LOCKABLE = (1 << 16), /* page_buf_t may be locked */ - _PBF_PRIVATE_BH = (1 << 17), /* do not use public buffer heads */ - _PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped */ - _PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated */ - _PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated */ - _PBF_MEM_SLAB = (1 << 21), /* underlying pages are slab allocated */ - - PBF_FORCEIO = (1 << 22), /* ignore any cache state */ - PBF_FLUSH = (1 << 23), /* flush disk write cache */ - PBF_READ_AHEAD = (1 << 24), /* asynchronous read-ahead */ - PBF_RUN_QUEUES = (1 << 25), /* run block device task queue */ - -} page_buf_flags_t; - -#define PBF_UPDATE (PBF_READ | PBF_WRITE) -#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0) -#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0) - -typedef struct pb_target { - dev_t pbr_dev; - struct block_device *pbr_bdev; - struct address_space *pbr_mapping; - unsigned int pbr_bsize; - unsigned int pbr_sshift; - size_t pbr_smask; -} pb_target_t; - -/* - * page_buf_t: Buffer structure for page cache-based buffers - * - * This buffer structure is used by the page cache buffer management routines - * to refer to an assembly of pages forming a logical buffer. 
The actual - * I/O is performed with buffer_head or bio structures, as required by drivers, - * for drivers which do not understand this structure. The buffer structure is - * used on temporary basis only, and discarded when released. - * - * The real data storage is recorded in the page cache. Metadata is - * hashed to the inode for the block device on which the file system resides. - * File data is hashed to the inode for the file. Pages which are only - * partially filled with data have bits set in their block_map entry - * to indicate which disk blocks in the page are not valid. - */ - -struct page_buf_s; -typedef void (*page_buf_iodone_t)(struct page_buf_s *); - /* call-back function on I/O completion */ -typedef void (*page_buf_relse_t)(struct page_buf_s *); - /* call-back function on I/O completion */ -typedef int (*page_buf_bdstrat_t)(struct page_buf_s *); - -#define PB_PAGES 4 - -typedef struct page_buf_s { - struct semaphore pb_sema; /* semaphore for lockables */ - unsigned long pb_flushtime; /* time to flush pagebuf */ - atomic_t pb_pin_count; /* pin count */ - wait_queue_head_t pb_waiters; /* unpin waiters */ - struct list_head pb_list; - page_buf_flags_t pb_flags; /* status flags */ - struct list_head pb_hash_list; - struct pb_target *pb_target; /* logical object */ - atomic_t pb_hold; /* reference count */ - page_buf_daddr_t pb_bn; /* block number for I/O */ - loff_t pb_file_offset; /* offset in file */ - size_t pb_buffer_length; /* size of buffer in bytes */ - size_t pb_count_desired; /* desired transfer size */ - void *pb_addr; /* virtual address of buffer */ - struct work_struct pb_iodone_work; - atomic_t pb_io_remaining;/* #outstanding I/O requests */ - page_buf_iodone_t pb_iodone; /* I/O completion function */ - page_buf_relse_t pb_relse; /* releasing function */ - page_buf_bdstrat_t pb_strat; /* pre-write function */ - struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ - void *pb_fspriv; - void *pb_fspriv2; - void *pb_fspriv3; - unsigned short pb_error; /* error code on I/O */ - unsigned short pb_page_count; /* size of page array */ - unsigned short pb_offset; /* page offset in first page */ - unsigned char pb_locked; /* page array is locked */ - unsigned char pb_hash_index; /* hash table index */ - struct page **pb_pages; /* array of page pointers */ - struct page *pb_page_array[PB_PAGES]; /* inline pages */ -#ifdef PAGEBUF_LOCK_TRACKING - int pb_last_holder; -#endif -} page_buf_t; - - -/* Finding and Reading Buffers */ - -extern page_buf_t *pagebuf_find( /* find buffer for block if */ - /* the block is in memory */ - struct pb_target *, /* inode for block */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK */ - -extern page_buf_t *pagebuf_get( /* allocate a buffer */ - struct pb_target *, /* inode for buffer */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ - /* PBF_ASYNC */ - -extern page_buf_t *pagebuf_lookup( - struct pb_target *, - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_READ, PBF_WRITE, */ - /* PBF_FORCEIO, _PBF_LOCKABLE */ - -extern page_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ - /* no memory or disk address */ - size_t len, - struct pb_target *); /* mount point "fake" inode */ - -extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ - /* without disk address */ - size_t len, - struct pb_target *); /* mount point "fake" inode */ 
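The page_buf_btoc()/page_buf_btoct()/page_buf_poff() macros removed earlier in this header convert a byte count into a page count (rounded up, or truncated) and extract the offset within a page; pagebuf_iomove() above leans on the latter two to locate each copy. The same arithmetic in self-contained form, assuming 4 KiB pages purely for the example:

#include <assert.h>
#include <stddef.h>

#define EX_PAGE_SHIFT 12                  /* assumed: 4 KiB pages */
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK  (~(EX_PAGE_SIZE - 1))

#define ex_btoc(dd)  (((dd) + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT) /* pages, rounded up */
#define ex_btoct(dd) ((dd) >> EX_PAGE_SHIFT)                      /* pages, truncated */
#define ex_poff(aa)  ((aa) & ~EX_PAGE_MASK)                       /* offset within page */

int main(void)
{
	size_t off = 5000;

	assert(ex_btoc(off) == 2);        /* 5000 bytes span two 4 KiB pages */
	assert(ex_btoct(off) == 1);       /* byte 5000 lives in page index 1 */
	assert(ex_poff(off) == 904);      /* 5000 - 4096 */
	return 0;
}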
- -extern int pagebuf_associate_memory( - page_buf_t *, - void *, - size_t); - -extern void pagebuf_hold( /* increment reference count */ - page_buf_t *); /* buffer to hold */ - -extern void pagebuf_readahead( /* read ahead into cache */ - struct pb_target *, /* target for buffer (or NULL) */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* additional read flags */ - -/* Releasing Buffers */ - -extern void pagebuf_free( /* deallocate a buffer */ - page_buf_t *); /* buffer to deallocate */ - -extern void pagebuf_rele( /* release hold on a buffer */ - page_buf_t *); /* buffer to release */ - -/* Locking and Unlocking Buffers */ - -extern int pagebuf_cond_lock( /* lock buffer, if not locked */ - /* (returns -EBUSY if locked) */ - page_buf_t *); /* buffer to lock */ - -extern int pagebuf_lock_value( /* return count on lock */ - page_buf_t *); /* buffer to check */ - -extern int pagebuf_lock( /* lock buffer */ - page_buf_t *); /* buffer to lock */ - -extern void pagebuf_unlock( /* unlock buffer */ - page_buf_t *); /* buffer to unlock */ - -/* Buffer Read and Write Routines */ - -extern void pagebuf_iodone( /* mark buffer I/O complete */ - page_buf_t *, /* buffer to mark */ - int, /* use data/log helper thread. */ - int); /* run completion locally, or in - * a helper thread. */ - -extern void pagebuf_ioerror( /* mark buffer in error (or not) */ - page_buf_t *, /* buffer to mark */ - unsigned int); /* error to store (0 if none) */ - -extern int pagebuf_iostart( /* start I/O on a buffer */ - page_buf_t *, /* buffer to start */ - page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ - /* PBF_READ, PBF_WRITE, */ - /* PBF_DELWRI, PBF_SYNC */ - -extern int pagebuf_iorequest( /* start real I/O */ - page_buf_t *); /* buffer to convey to device */ - -extern int pagebuf_iowait( /* wait for buffer I/O done */ - page_buf_t *); /* buffer to wait on */ - -extern void pagebuf_iomove( /* move data in/out of pagebuf */ - page_buf_t *, /* buffer to manipulate */ - size_t, /* starting buffer offset */ - size_t, /* length in buffer */ - caddr_t, /* data pointer */ - page_buf_rw_t); /* direction */ - -static inline int pagebuf_iostrategy(page_buf_t *pb) -{ - return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); -} - -static inline int pagebuf_geterror(page_buf_t *pb) -{ - return pb ? 
pb->pb_error : ENOMEM; -} - -/* Buffer Utility Routines */ - -extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ - page_buf_t *, /* buffer to offset into */ - size_t); /* offset */ - -/* Pinning Buffer Storage in Memory */ - -extern void pagebuf_pin( /* pin buffer in memory */ - page_buf_t *); /* buffer to pin */ - -extern void pagebuf_unpin( /* unpin buffered data */ - page_buf_t *); /* buffer to unpin */ - -extern int pagebuf_ispin( /* check if buffer is pinned */ - page_buf_t *); /* buffer to check */ - -/* Delayed Write Buffer Routines */ - -#define PBDF_WAIT 0x01 -extern void pagebuf_delwri_flush( - pb_target_t *, - unsigned long, - int *); - -extern void pagebuf_delwri_dequeue( - page_buf_t *); - -/* Buffer Daemon Setup Routines */ - -extern int pagebuf_init(void); -extern void pagebuf_terminate(void); - - -#ifdef PAGEBUF_TRACE -extern ktrace_t *pagebuf_trace_buf; -extern void pagebuf_trace( - page_buf_t *, /* buffer being traced */ - char *, /* description of operation */ - void *, /* arbitrary diagnostic value */ - void *); /* return address */ -#else -# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) -#endif - -#define pagebuf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) - -#endif /* __PAGE_BUF_H__ */ diff -Nru a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c --- a/fs/xfs/quota/xfs_qm.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/quota/xfs_qm.c Thu Jan 29 23:15:58 2004 @@ -78,7 +78,6 @@ */ mutex_t xfs_Gqm_lock; struct xfs_qm *xfs_Gqm; -EXPORT_SYMBOL(xfs_Gqm); /* used by xfsidbg */ kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; diff -Nru a/fs/xfs/support/kmem.h b/fs/xfs/support/kmem.h --- a/fs/xfs/support/kmem.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_KMEM_H__ -#define __XFS_SUPPORT_KMEM_H__ - -#include -#include -#include -#include - -/* - * Cutoff point to use vmalloc instead of kmalloc. - */ -#define MAX_SLAB_SIZE 0x10000 - -/* - * XFS uses slightly different names for these due to the - * IRIX heritage. 
- */ -#define kmem_zone kmem_cache_s -#define kmem_zone_t kmem_cache_t - -#define KM_SLEEP 0x0001 -#define KM_NOSLEEP 0x0002 -#define KM_NOFS 0x0004 - -typedef unsigned long xfs_pflags_t; - -#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS) - -#define PFLAGS_SET_FSTRANS(STATEP) do { \ - *(STATEP) = current->flags; \ - current->flags |= PF_FSTRANS; \ -} while (0) - -#define PFLAGS_RESTORE(STATEP) do { \ - current->flags = *(STATEP); \ -} while (0) - -#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \ - *(NSTATEP) = *(OSTATEP); \ -} while (0) - -/* - * XXX get rid of the unconditional __GFP_NOFAIL by adding - * a KM_FAIL flag and using it where we're allowed to fail. - */ -static __inline unsigned int -kmem_flags_convert(int flags) -{ - int lflags; - -#if DEBUG - if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) { - printk(KERN_WARNING - "XFS: memory allocation with wrong flags (%x)\n", flags); - BUG(); - } -#endif - - lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL); - - /* avoid recusive callbacks to filesystem during transactions */ - if (PFLAGS_TEST_FSTRANS()) - lflags &= ~__GFP_FS; - - return lflags; -} - -static __inline void * -kmem_alloc(size_t size, int flags) -{ - if (unlikely(MAX_SLAB_SIZE < size)) - /* Avoid doing filesystem sensitive stuff to get this */ - return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL); - return kmalloc(size, kmem_flags_convert(flags)); -} - -static __inline void * -kmem_zalloc(size_t size, int flags) -{ - void *ptr = kmem_alloc(size, flags); - if (likely(ptr != NULL)) - memset(ptr, 0, size); - return ptr; -} - -static __inline void -kmem_free(void *ptr, size_t size) -{ - if (unlikely((unsigned long)ptr < VMALLOC_START || - (unsigned long)ptr >= VMALLOC_END)) - kfree(ptr); - else - vfree(ptr); -} - -static __inline void * -kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags) -{ - void *new = kmem_alloc(newsize, flags); - - if (likely(ptr != NULL)) { - if (likely(new != NULL)) - memcpy(new, ptr, min(oldsize, newsize)); - kmem_free(ptr, oldsize); - } - - return new; -} - -static __inline kmem_zone_t * -kmem_zone_init(int size, char *zone_name) -{ - return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); -} - -static __inline void * -kmem_zone_alloc(kmem_zone_t *zone, int flags) -{ - return kmem_cache_alloc(zone, kmem_flags_convert(flags)); -} - -static __inline void * -kmem_zone_zalloc(kmem_zone_t *zone, int flags) -{ - void *ptr = kmem_zone_alloc(zone, flags); - if (likely(ptr != NULL)) - memset(ptr, 0, kmem_cache_size(zone)); - return ptr; -} - -static __inline void -kmem_zone_free(kmem_zone_t *zone, void *ptr) -{ - kmem_cache_free(zone, ptr); -} - -typedef struct shrinker *kmem_shaker_t; -typedef int (*kmem_shake_func_t)(int, unsigned int); - -static __inline kmem_shaker_t -kmem_shake_register(kmem_shake_func_t sfunc) -{ - return set_shrinker(DEFAULT_SEEKS, sfunc); -} - -static __inline void -kmem_shake_deregister(kmem_shaker_t shrinker) -{ - remove_shrinker(shrinker); -} - -static __inline int -kmem_shake_allow(unsigned int gfp_mask) -{ - return (gfp_mask & __GFP_WAIT); -} - -#endif /* __XFS_SUPPORT_KMEM_H__ */ diff -Nru a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c --- a/fs/xfs/support/ktrace.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/support/ktrace.c Thu Jan 29 23:15:58 2004 @@ -30,13 +30,12 @@ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ */ -#include #include #include #include -#include "kmem.h" -#include "spin.h" +#include +#include #include "debug.h" #include "ktrace.h" @@ 
-273,7 +272,6 @@ } return ktep; } -EXPORT_SYMBOL(ktrace_first); /* * ktrace_next() @@ -308,7 +306,6 @@ return ktep; } -EXPORT_SYMBOL(ktrace_next); /* * ktrace_skip() diff -Nru a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h --- a/fs/xfs/support/ktrace.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/support/ktrace.h Thu Jan 29 23:15:58 2004 @@ -32,7 +32,7 @@ #ifndef __XFS_SUPPORT_KTRACE_H__ #define __XFS_SUPPORT_KTRACE_H__ -#include +#include /* * Trace buffer entry structure. diff -Nru a/fs/xfs/support/mrlock.c b/fs/xfs/support/mrlock.c --- a/fs/xfs/support/mrlock.c Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ - -#include -#include -#include -#include -#include - -#include "mrlock.h" - - -#if USE_RW_WAIT_QUEUE_SPINLOCK -# define wq_write_lock write_lock -#else -# define wq_write_lock spin_lock -#endif - -/* - * We don't seem to need lock_type (only one supported), name, or - * sequence. But, XFS will pass it so let's leave them here for now. - */ -/* ARGSUSED */ -void -mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence) -{ - mrp->mr_count = 0; - mrp->mr_reads_waiting = 0; - mrp->mr_writes_waiting = 0; - init_waitqueue_head(&mrp->mr_readerq); - init_waitqueue_head(&mrp->mr_writerq); - mrp->mr_lock = SPIN_LOCK_UNLOCKED; -} - -/* - * Macros to lock/unlock the mrlock_t. - */ - -#define MRLOCK(m) spin_lock(&(m)->mr_lock); -#define MRUNLOCK(m) spin_unlock(&(m)->mr_lock); - - -/* - * lock_wait should never be called in an interrupt thread. - * - * mrlocks can sleep (i.e. call schedule) and so they can't ever - * be called from an interrupt thread. - * - * threads that wake-up should also never be invoked from interrupt threads. - * - * But, waitqueue_lock is locked from interrupt threads - and we are - * called with interrupts disabled, so it is all OK. 
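The comment just above states the contract of lock_wait(), whose body follows: the caller goes to sleep with its spinlock atomically released and wakes up with the lock re-held. In POSIX terms that is the pthread_cond_wait() contract; a minimal analogue, assuming a mutex stands in for mr_lock and a condition variable for the wait queue:

#include <pthread.h>

struct mr_waiter {
	pthread_mutex_t lock;             /* stands in for mrp->mr_lock */
	pthread_cond_t q;                 /* stands in for the wait queue */
};

static void lock_wait_analogue(struct mr_waiter *w)
{
	/*
	 * Caller holds w->lock. pthread_cond_wait() releases it while
	 * sleeping and reacquires it before returning, matching
	 * lock_wait()'s "return with lock held" behaviour.
	 */
	pthread_cond_wait(&w->q, &w->lock);
}

As with any condition-variable style wait, callers must re-check the guarded predicate after waking, which is exactly what the while loops around lock_wait() in mraccessf() and mrupdatef() below do.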
- */ - -/* ARGSUSED */ -void -lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw) -{ - DECLARE_WAITQUEUE( wait, current ); - - __set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock(&q->lock); - if (rw) { - __add_wait_queue_tail(q, &wait); - } else { - __add_wait_queue(q, &wait); - } - - spin_unlock(&q->lock); - spin_unlock(lock); - - schedule(); - - spin_lock(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock(&q->lock); - - spin_lock(lock); - - /* return with lock held */ -} - -/* ARGSUSED */ -void -mrfree(mrlock_t *mrp) -{ -} - -/* ARGSUSED */ -void -mrlock(mrlock_t *mrp, int type, int flags) -{ - if (type == MR_ACCESS) - mraccess(mrp); - else - mrupdate(mrp); -} - -/* ARGSUSED */ -void -mraccessf(mrlock_t *mrp, int flags) -{ - MRLOCK(mrp); - if(mrp->mr_writes_waiting > 0) { - mrp->mr_reads_waiting++; - lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); - mrp->mr_reads_waiting--; - } - while (mrp->mr_count < 0) { - mrp->mr_reads_waiting++; - lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0); - mrp->mr_reads_waiting--; - } - mrp->mr_count++; - MRUNLOCK(mrp); -} - -/* ARGSUSED */ -void -mrupdatef(mrlock_t *mrp, int flags) -{ - MRLOCK(mrp); - while(mrp->mr_count) { - mrp->mr_writes_waiting++; - lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1); - mrp->mr_writes_waiting--; - } - - mrp->mr_count = -1; /* writer on it */ - MRUNLOCK(mrp); -} - -int -mrtryaccess(mrlock_t *mrp) -{ - MRLOCK(mrp); - /* - * If anyone is waiting for update access or the lock is held for update - * fail the request. - */ - if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) { - MRUNLOCK(mrp); - return 0; - } - mrp->mr_count++; - MRUNLOCK(mrp); - return 1; -} - -int -mrtrypromote(mrlock_t *mrp) -{ - MRLOCK(mrp); - - if(mrp->mr_count == 1) { /* We are the only thread with the lock */ - mrp->mr_count = -1; /* writer on it */ - MRUNLOCK(mrp); - return 1; - } - - MRUNLOCK(mrp); - return 0; -} - -int -mrtryupdate(mrlock_t *mrp) -{ - MRLOCK(mrp); - - if(mrp->mr_count) { - MRUNLOCK(mrp); - return 0; - } - - mrp->mr_count = -1; /* writer on it */ - MRUNLOCK(mrp); - return 1; -} - -static __inline__ void mrwake(mrlock_t *mrp) -{ - /* - * First, if the count is now 0, we need to wake-up anyone waiting. - */ - if (!mrp->mr_count) { - if (mrp->mr_writes_waiting) { /* Wake-up first writer waiting */ - wake_up(&mrp->mr_writerq); - } else if (mrp->mr_reads_waiting) { /* Wakeup any readers waiting */ - wake_up(&mrp->mr_readerq); - } - } -} - -void -mraccunlock(mrlock_t *mrp) -{ - MRLOCK(mrp); - mrp->mr_count--; - mrwake(mrp); - MRUNLOCK(mrp); -} - -void -mrunlock(mrlock_t *mrp) -{ - MRLOCK(mrp); - if (mrp->mr_count < 0) { - mrp->mr_count = 0; - } else { - mrp->mr_count--; - } - mrwake(mrp); - MRUNLOCK(mrp); -} - -int -ismrlocked(mrlock_t *mrp, int type) /* No need to lock since info can change */ -{ - if (type == MR_ACCESS) - return (mrp->mr_count > 0); /* Read lock */ - else if (type == MR_UPDATE) - return (mrp->mr_count < 0); /* Write lock */ - else if (type == (MR_UPDATE | MR_ACCESS)) - return (mrp->mr_count); /* Any type of lock held */ - else /* Any waiters */ - return (mrp->mr_reads_waiting | mrp->mr_writes_waiting); -} - -/* - * Demote from update to access. We better be the only thread with the - * lock in update mode so it should be easy to set to 1. - * Wake-up any readers waiting. 
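The single mr_count field carries the whole lock state in the code above: a positive value counts readers, -1 marks a writer, and 0 means free, which is why mrtryaccess() and mrtryupdate() reduce to one comparison each. A compact user-space rendering of those two predicates, assuming an ordinary mutex in place of mr_lock:

#include <pthread.h>

struct mr {
	pthread_mutex_t lock;
	int count;                        /* >0: readers, -1: writer, 0: free */
	int writes_waiting;
};

static int mr_tryaccess(struct mr *m)    /* reader */
{
	int ok;

	pthread_mutex_lock(&m->lock);
	/* fail if a writer holds it, or writers are queued (writer priority) */
	ok = !(m->writes_waiting > 0 || m->count < 0);
	if (ok)
		m->count++;
	pthread_mutex_unlock(&m->lock);
	return ok;
}

static int mr_tryupdate(struct mr *m)    /* writer */
{
	int ok;

	pthread_mutex_lock(&m->lock);
	ok = (m->count == 0);             /* any holder at all blocks a writer */
	if (ok)
		m->count = -1;            /* writer on it */
	pthread_mutex_unlock(&m->lock);
	return ok;
}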
- */ - -void -mrdemote(mrlock_t *mrp) -{ - MRLOCK(mrp); - mrp->mr_count = 1; - if (mrp->mr_reads_waiting) { /* Wakeup all readers waiting */ - wake_up(&mrp->mr_readerq); - } - MRUNLOCK(mrp); -} diff -Nru a/fs/xfs/support/mrlock.h b/fs/xfs/support/mrlock.h --- a/fs/xfs/support/mrlock.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_MRLOCK_H__ -#define __XFS_SUPPORT_MRLOCK_H__ - -#include -#include -#include -#include - -/* - * Implement mrlocks on Linux that work for XFS. - * - * These are sleep locks and not spinlocks. If one wants read/write spinlocks, - * use read_lock, write_lock, ... see spinlock.h. - */ - -typedef struct mrlock_s { - int mr_count; - unsigned short mr_reads_waiting; - unsigned short mr_writes_waiting; - wait_queue_head_t mr_readerq; - wait_queue_head_t mr_writerq; - spinlock_t mr_lock; -} mrlock_t; - -#define MR_ACCESS 1 -#define MR_UPDATE 2 - -#define MRLOCK_BARRIER 0x1 -#define MRLOCK_ALLOW_EQUAL_PRI 0x8 - -/* - * mraccessf/mrupdatef take flags to be passed in while sleeping; - * only PLTWAIT is currently supported. 
- */ - -extern void mraccessf(mrlock_t *, int); -extern void mrupdatef(mrlock_t *, int); -extern void mrlock(mrlock_t *, int, int); -extern void mrunlock(mrlock_t *); -extern void mraccunlock(mrlock_t *); -extern int mrtryupdate(mrlock_t *); -extern int mrtryaccess(mrlock_t *); -extern int mrtrypromote(mrlock_t *); -extern void mrdemote(mrlock_t *); - -extern int ismrlocked(mrlock_t *, int); -extern void mrlock_init(mrlock_t *, int type, char *name, long sequence); -extern void mrfree(mrlock_t *); - -#define mrinit(mrp, name) mrlock_init(mrp, MRLOCK_BARRIER, name, -1) -#define mraccess(mrp) mraccessf(mrp, 0) /* grab for READ/ACCESS */ -#define mrupdate(mrp) mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */ -#define mrislocked_access(mrp) ((mrp)->mr_count > 0) -#define mrislocked_update(mrp) ((mrp)->mr_count < 0) - -#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff -Nru a/fs/xfs/support/mutex.h b/fs/xfs/support/mutex.h --- a/fs/xfs/support/mutex.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_MUTEX_H__ -#define __XFS_SUPPORT_MUTEX_H__ - -#include -#include - -/* - * Map the mutex'es from IRIX to Linux semaphores. - * - * Destroy just simply initializes to -99 which should block all other - * callers. - */ -#define MUTEX_DEFAULT 0x0 -typedef struct semaphore mutex_t; - -#define mutex_init(lock, type, name) sema_init(lock, 1) -#define mutex_destroy(lock) sema_init(lock, -99) -#define mutex_lock(lock, num) down(lock) -#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1) -#define mutex_unlock(lock) up(lock) - -#endif /* __XFS_SUPPORT_MUTEX_H__ */ diff -Nru a/fs/xfs/support/sema.h b/fs/xfs/support/sema.h --- a/fs/xfs/support/sema.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_SEMA_H__ -#define __XFS_SUPPORT_SEMA_H__ - -#include -#include -#include -#include - -/* - * sema_t structure just maps to struct semaphore in Linux kernel. - */ - -typedef struct semaphore sema_t; - -#define init_sema(sp, val, c, d) sema_init(sp, val) -#define initsema(sp, val) sema_init(sp, val) -#define initnsema(sp, val, name) sema_init(sp, val) -#define psema(sp, b) down(sp) -#define vsema(sp) up(sp) -#define valusema(sp) (atomic_read(&(sp)->count)) -#define freesema(sema) - -/* - * Map cpsema (try to get the sema) to down_trylock. We need to switch - * the return values since cpsema returns 1 (acquired) 0 (failed) and - * down_trylock returns the reverse 0 (acquired) 1 (failed). - */ - -#define cpsema(sp) (down_trylock(sp) ? 0 : 1) - -/* - * Didn't do cvsema(sp). Not sure how to map this to up/down/... - * It does a vsema if the values is < 0 other wise nothing. - */ - -#endif /* __XFS_SUPPORT_SEMA_H__ */ diff -Nru a/fs/xfs/support/spin.h b/fs/xfs/support/spin.h --- a/fs/xfs/support/spin.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. - * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
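Note the inversion hidden in the cpsema() mapping above: IRIX cpsema() returns 1 when the semaphore is acquired and 0 when not, while down_trylock() returns 0 on success. POSIX sem_trywait() follows the down_trylock() convention, so a user-space analogue needs the same flip:

#include <assert.h>
#include <semaphore.h>

static int cpsema_analogue(sem_t *sp)
{
	return sem_trywait(sp) == 0;      /* 0 from sem_trywait means acquired */
}

int main(void)
{
	sem_t s;

	sem_init(&s, 0, 1);
	assert(cpsema_analogue(&s) == 1); /* first attempt succeeds */
	assert(cpsema_analogue(&s) == 0); /* semaphore now unavailable */
	sem_destroy(&s);
	return 0;
}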
- * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_SPIN_H__ -#define __XFS_SUPPORT_SPIN_H__ - -#include /* preempt needs this */ -#include - -/* - * Map lock_t from IRIX to Linux spinlocks. - * - * Note that linux turns on/off spinlocks depending on CONFIG_SMP. - * We don't need to worry about SMP or not here. - */ - -#define SPLDECL(s) unsigned long s - -typedef spinlock_t lock_t; - -#define spinlock_init(lock, name) spin_lock_init(lock) -#define spinlock_destroy(lock) - -static inline unsigned long mutex_spinlock(lock_t *lock) -{ - spin_lock(lock); - return 0; -} - -/*ARGSUSED*/ -static inline void mutex_spinunlock(lock_t *lock, unsigned long s) -{ - spin_unlock(lock); -} - -static inline void nested_spinlock(lock_t *lock) -{ - spin_lock(lock); -} - -static inline void nested_spinunlock(lock_t *lock) -{ - spin_unlock(lock); -} - -#endif /* __XFS_SUPPORT_SPIN_H__ */ diff -Nru a/fs/xfs/support/sv.h b/fs/xfs/support/sv.h --- a/fs/xfs/support/sv.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. - * Portions Copyright (c) 2002 Christoph Hellwig. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_SV_H__ -#define __XFS_SUPPORT_SV_H__ - -#include -#include -#include - -/* - * Synchronisation variables. 
- * - * (Parameters "pri", "svf" and "rts" are not implemented) - */ - -typedef struct sv_s { - wait_queue_head_t waiters; -} sv_t; - -#define SV_FIFO 0x0 /* sv_t is FIFO type */ -#define SV_LIFO 0x2 /* sv_t is LIFO type */ -#define SV_PRIO 0x4 /* sv_t is PRIO type */ -#define SV_KEYED 0x6 /* sv_t is KEYED type */ -#define SV_DEFAULT SV_FIFO - - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, - unsigned long timeout) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(state); - spin_unlock(lock); - - schedule_timeout(timeout); - - remove_wait_queue(&sv->waiters, &wait); -} - -#define init_sv(sv,type,name,flag) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_init(sv,flag,name) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_destroy(sv) \ - /*NOTHING*/ -#define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_wait_sig(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) -#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) -#define sv_signal(sv) \ - wake_up(&(sv)->waiters) -#define sv_broadcast(sv) \ - wake_up_all(&(sv)->waiters) - -#endif /* __XFS_SUPPORT_SV_H__ */ diff -Nru a/fs/xfs/support/time.h b/fs/xfs/support/time.h --- a/fs/xfs/support/time.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
- * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_SUPPORT_TIME_H__ -#define __XFS_SUPPORT_TIME_H__ - -#include -#include - -typedef struct timespec timespec_t; - -static inline void delay(long ticks) -{ - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(ticks); -} - -static inline void nanotime(struct timespec *tvp) -{ - *tvp = CURRENT_TIME; -} - -#endif /* __XFS_SUPPORT_TIME_H__ */ diff -Nru a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c --- a/fs/xfs/support/uuid.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/support/uuid.c Thu Jan 29 23:15:58 2004 @@ -30,14 +30,7 @@ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ */ -#include -#include -#include -#include "time.h" -#include "uuid.h" -#include "kmem.h" -#include "debug.h" -#include "mutex.h" +#include static mutex_t uuid_monitor; static int uuid_table_size; diff -Nru a/fs/xfs/xfs.h b/fs/xfs/xfs.h --- a/fs/xfs/xfs.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs.h Thu Jan 29 23:15:58 2004 @@ -32,28 +32,8 @@ #ifndef __XFS_H__ #define __XFS_H__ -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include #include -#include #endif /* __XFS_H__ */ diff -Nru a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c --- a/fs/xfs/xfs_attr.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_attr.c Thu Jan 29 23:15:58 2004 @@ -106,12 +106,11 @@ #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ #define ATTR_RMTVALUE_TRANSBLKS 8 /* max # of blks in a transaction */ -#if defined(DEBUG) +#if defined(XFS_ATTR_TRACE) ktrace_t *xfs_attr_trace_buf; #endif - /*======================================================================== * Overall external interface routines. 
*========================================================================*/ @@ -2589,6 +2588,14 @@ } STATIC int +attr_secure_capable( + struct vnode *vp, + cred_t *cred) +{ + return -ENOSECURITY; +} + +STATIC int attr_system_set( struct vnode *vp, char *name, void *data, size_t size, int xflags) { @@ -2651,6 +2658,16 @@ .attr_capable = attr_trusted_capable, }; +struct attrnames attr_secure = { + .attr_name = "security.", + .attr_namelen = sizeof("security.") - 1, + .attr_flag = ATTR_SECURE, + .attr_get = attr_generic_get, + .attr_set = attr_generic_set, + .attr_remove = attr_generic_remove, + .attr_capable = attr_secure_capable, +}; + struct attrnames attr_user = { .attr_name = "user.", .attr_namelen = sizeof("user.") - 1, @@ -2661,4 +2678,4 @@ }; struct attrnames *attr_namespaces[] = - { &attr_system, &attr_trusted, &attr_user }; + { &attr_system, &attr_trusted, &attr_secure, &attr_user }; diff -Nru a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h --- a/fs/xfs/xfs_attr.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_attr.h Thu Jan 29 23:15:58 2004 @@ -69,8 +69,9 @@ attrcapable_t attr_capable; } attrnames_t; -#define ATTR_NAMECOUNT 3 +#define ATTR_NAMECOUNT 4 extern struct attrnames attr_user; +extern struct attrnames attr_secure; extern struct attrnames attr_system; extern struct attrnames attr_trusted; extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; @@ -86,6 +87,7 @@ #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ #define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ #define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ diff -Nru a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c --- a/fs/xfs/xfs_attr_leaf.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_attr_leaf.c Thu Jan 29 23:15:58 2004 @@ -159,6 +159,9 @@ continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((sfe->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -173,7 +176,8 @@ sfe->namelen = args->namelen; INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen); - sfe->flags = (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0; + sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : + ((args->flags & ATTR_ROOT) ? 
XFS_ATTR_ROOT : 0); memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); INT_MOD(sf->hdr.count, ARCH_CONVERT, 1); @@ -209,6 +213,9 @@ continue; if (memcmp(sfe->nameval, args->name, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((sfe->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -253,6 +260,9 @@ continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((sfe->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -281,6 +291,9 @@ continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((sfe->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -369,7 +382,8 @@ nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); nargs.hashval = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - nargs.flags = (sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0; + nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : + ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); error = xfs_attr_leaf_add(bp, &nargs); @@ -446,14 +460,15 @@ i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { attrnames_t *namesp; - namesp = (sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user; if (((context->flags & ATTR_ROOT) != 0) != ((sfe->flags & XFS_ATTR_ROOT) != 0) && !(context->flags & ATTR_KERNFULLS)) { sfe = XFS_ATTR_SF_NEXTENTRY(sfe); continue; } + namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: + ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); if (context->flags & ATTR_KERNOVAL) { ASSERT(context->flags & ATTR_KERNAMELS); context->count += namesp->attr_namelen + @@ -548,8 +563,9 @@ for ( ; i < nsbuf; i++, sbp++) { attrnames_t *namesp; - namesp = (sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user; + namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure : + ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) { cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT); @@ -668,7 +684,8 @@ nargs.value = (char *)&name_loc->nameval[nargs.namelen]; nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT); nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT); - nargs.flags = (entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0; + nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : + ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); xfs_attr_shortform_add(&nargs); } error = 0; @@ -963,7 +980,8 @@ + INT_GET(map->size, ARCH_CONVERT)); INT_SET(entry->hashval, ARCH_CONVERT, args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0; + entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : + ((args->flags & ATTR_ROOT) ? 
XFS_ATTR_ROOT : 0); if (args->rename) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -1881,6 +1899,9 @@ if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((entry->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((entry->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -1893,6 +1914,9 @@ if (memcmp(args->name, (char *)name_rmt->name, args->namelen) != 0) continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((entry->flags & XFS_ATTR_SECURE) != 0)) + continue; if (((args->flags & ATTR_ROOT) != 0) != ((entry->flags & XFS_ATTR_ROOT) != 0)) continue; @@ -2290,8 +2314,9 @@ !(context->flags & ATTR_KERNFULLS)) continue; /* skip non-matching entries */ - namesp = (entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user; + namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure : + ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); if (entry->flags & XFS_ATTR_LOCAL) { name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); diff -Nru a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h --- a/fs/xfs/xfs_attr_leaf.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_attr_leaf.h Thu Jan 29 23:15:58 2004 @@ -73,9 +73,9 @@ * to work "forw"ard. If none matches, continue with the "forw"ard leaf * nodes until the hash key changes or the attribute name is found. * - * We store the fact that an attribute is a ROOT versus USER attribute in + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in * the leaf_entry. The namespaces are independent only because we also look - * at the root/user bit when we are looking for a matching attribute name. + * at the namespace bit when we are looking for a matching attribute name. * * We also store a "incomplete" bit in the leaf_entry. It shows that an * attribute is in the middle of being created and should not be shown to @@ -102,7 +102,7 @@ struct xfs_attr_leaf_entry { /* sorted on key, not name */ xfs_dahash_t hashval; /* hash value of name */ __uint16_t nameidx; /* index into buffer of name/value */ - __uint8_t flags; /* LOCAL, ROOT and INCOMPLETE flags */ + __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ __uint8_t pad2; /* unused pad byte */ } entries[1]; /* variable sized array */ struct xfs_attr_leaf_name_local { @@ -130,9 +130,11 @@ */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ #define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) #define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) /* diff -Nru a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/xfs_behavior.c Thu Jan 29 23:15:58 2004 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
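Each lookup hunk above skips a leaf entry unless its namespace bit agrees with the caller's flags. Since ATTR_SECURE (0x0008) and XFS_ATTR_SECURE (1 << 2) sit at different bit positions, both sides are normalized with "!= 0" before they can be compared as booleans. The predicate, distilled into a stand-alone check (flag values taken from the hunks above):

#include <assert.h>

#define ATTR_SECURE     0x0008            /* caller-side flag (xfs_attr.h) */
#define XFS_ATTR_SECURE (1 << 2)          /* on-disk entry flag (xfs_attr_leaf.h) */

/* 1 if the caller's flags and the entry's flags name the same namespace */
static int same_namespace(int argflags, int entflags)
{
	return ((argflags & ATTR_SECURE) != 0) == ((entflags & XFS_ATTR_SECURE) != 0);
}

int main(void)
{
	assert(same_namespace(ATTR_SECURE, XFS_ATTR_SECURE)); /* both secure */
	assert(same_namespace(0, 0));                         /* neither secure */
	assert(!same_namespace(ATTR_SECURE, 0));              /* mismatch: skip */
	return 0;
}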
+ * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + */ +#include "xfs.h" + +/* + * Source file used to associate/disassociate behaviors with virtualized + * objects. See xfs_behavior.h for more information about behaviors, etc. + * + * The implementation is split between functions in this file and macros + * in xfs_behavior.h. + */ + +/* + * Insert a new behavior descriptor into a behavior chain. + * + * The behavior chain is ordered based on the 'position' number which + * lives in the first field of the ops vector (higher numbers first). + * + * Attempts to insert duplicate ops result in an EINVAL return code. + * Otherwise, return 0 to indicate success. + */ +int +bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) +{ + bhv_desc_t *curdesc, *prev; + int position; + + /* + * Validate the position value of the new behavior. + */ + position = BHV_POSITION(bdp); + ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP); + + /* + * Find location to insert behavior. Check for duplicates. + */ + prev = NULL; + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + /* Check for duplication. */ + if (curdesc->bd_ops == bdp->bd_ops) { + ASSERT(0); + return EINVAL; + } + + /* Find correct position */ + if (position >= BHV_POSITION(curdesc)) { + ASSERT(position != BHV_POSITION(curdesc)); + break; /* found it */ + } + + prev = curdesc; + } + + if (prev == NULL) { + /* insert at front of chain */ + bdp->bd_next = bhp->bh_first; + bhp->bh_first = bdp; + } else { + /* insert after prev */ + bdp->bd_next = prev->bd_next; + prev->bd_next = bdp; + } + + return 0; +} + +/* + * Remove a behavior descriptor from a position in a behavior chain; + * the position is guaranteed not to be the first position. + * Should only be called by the bhv_remove() macro. + */ +void +bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) +{ + bhv_desc_t *curdesc, *prev; + + ASSERT(bhp->bh_first != NULL); + ASSERT(bhp->bh_first->bd_next != NULL); + + prev = bhp->bh_first; + for (curdesc = bhp->bh_first->bd_next; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc == bdp) + break; /* found it */ + prev = curdesc; + } + + ASSERT(curdesc == bdp); + prev->bd_next = bdp->bd_next; /* remove from after prev */ +} + +/* + * Look for a specific ops vector on the specified behavior chain. + * Return the associated behavior descriptor. Or NULL, if not found.
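+ * + * For example, a caller holding a chain head could recover the descriptor + * for a known ops vector with a (hypothetical) call such as + * bdp = bhv_lookup(bhp, &my_ops); + * and then reach that behavior's private data through BHV_PDATA(bdp).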
+ */ +bhv_desc_t * +bhv_lookup(bhv_head_t *bhp, void *ops) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_ops == ops) + return curdesc; + } + + return NULL; +} + +/* + * Looks for the first behavior within a specified range of positions. + * Return the associated behavior descriptor. Or NULL, if none found. + */ +bhv_desc_t * +bhv_lookup_range(bhv_head_t *bhp, int low, int high) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + int position = BHV_POSITION(curdesc); + + if (position <= high) { + if (position >= low) + return curdesc; + return NULL; + } + } + + return NULL; +} + +/* + * Return the base behavior in the chain, or NULL if the chain + * is empty. + * + * The caller has not read locked the behavior chain, so acquire the + * lock before traversing the chain. + */ +bhv_desc_t * +bhv_base(bhv_head_t *bhp) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_next == NULL) { + return curdesc; + } + } + + return NULL; +} + +void +bhv_head_init( + bhv_head_t *bhp, + char *name) +{ + bhp->bh_first = NULL; +} + +void +bhv_insert_initial( + bhv_head_t *bhp, + bhv_desc_t *bdp) +{ + ASSERT(bhp->bh_first == NULL); + (bhp)->bh_first = bdp; +} + +void +bhv_head_destroy( + bhv_head_t *bhp) +{ + ASSERT(bhp->bh_first == NULL); +} diff -Nru a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/xfs_behavior.h Thu Jan 29 23:15:58 2004 @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BEHAVIOR_H__ +#define __XFS_BEHAVIOR_H__ + +/* + * Header file used to associate behaviors with virtualized objects. + * + * A virtualized object is an internal, virtualized representation of + * OS entities such as persistent files, processes, or sockets. Examples + * of virtualized objects include vnodes, vprocs, and vsockets. Often + * a virtualized object is referred to simply as an "object." + * + * A behavior is essentially an implementation layer associated with + * an object. 
Multiple behaviors for an object are chained together, + * the order of chaining determining the order of invocation. Each + * behavior of a given object implements the same set of interfaces + * (e.g., the VOP interfaces). + * + * Behaviors may be dynamically inserted into an object's behavior chain, + * such that the addition is transparent to consumers that already have + * references to the object. Typically, a given behavior will be inserted + * at a particular location in the behavior chain. Insertion of new + * behaviors is synchronized with operations-in-progress (oip's) so that + * the oip's always see a consistent view of the chain. + * + * The term "interposition" is used to refer to the act of inserting + * a behavior such that it interposes on (i.e., is inserted in front + * of) a particular other behavior. A key example of this is when a + * system implementing distributed single system image wishes to + * interpose a distribution layer (providing distributed coherency) + * in front of an object that is otherwise only accessed locally. + * + * Note that the traditional vnode/inode combination is simply a virtualized + * object that has exactly one associated behavior. + * + * Behavior synchronization is logic which is necessary under certain + * circumstances to ensure that there is no conflict between ongoing operations + * traversing the behavior chain and those dynamically modifying the + * behavior chain. Because behavior synchronization adds extra overhead + * to virtual operation invocation, we want to restrict, as much as + * we can, the requirement for this extra code, to those situations + * in which it is truly necessary. + * + * Behavior synchronization is needed whenever there's at least one class + * of object in the system for which: + * 1) multiple behaviors for a given object are supported, + * -- AND -- + * 2a) insertion of a new behavior can happen dynamically at any time during + * the life of an active object, + * -- AND -- + * 3a) insertion of a new behavior needs to synchronize with existing + * ops-in-progress. + * -- OR -- + * 3b) multiple different behaviors can be dynamically inserted at + * any time during the life of an active object + * -- OR -- + * 3c) removal of a behavior can occur at any time during the life of + * an active object. + * -- OR -- + * 2b) removal of a behavior can occur at any time during the life of an + * active object + * + */ + +struct bhv_head_lock; + +/* + * Behavior head. Head of the chain of behaviors. + * Contained within each virtualized object data structure. + */ +typedef struct bhv_head { + struct bhv_desc *bh_first; /* first behavior in chain */ + struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */ +} bhv_head_t; + +/* + * Behavior descriptor. Descriptor associated with each behavior. + * Contained within the behavior's private data structure. + */ +typedef struct bhv_desc { + void *bd_pdata; /* private data for this behavior */ + void *bd_vobj; /* virtual object associated with */ + void *bd_ops; /* ops for this behavior */ + struct bhv_desc *bd_next; /* next behavior in chain */ +} bhv_desc_t; + +/* + * Behavior identity field. A behavior's identity determines the position + * where it lives within a behavior chain, and it's always the first field + * of the behavior's ops vector. The optional id field further identifies the + * subsystem responsible for the behavior.
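+ * + * For example, an ops vector meant to sit at the bottom of a chain could + * lead with an identity built as BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE) + * (see below), so that bhv_insert() places every other behavior in front of it.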
+ */ +typedef struct bhv_identity { + __u16 bi_id; /* owning subsystem id */ + __u16 bi_position; /* position in chain */ +} bhv_identity_t; + +typedef bhv_identity_t bhv_position_t; + +#define BHV_IDENTITY_INIT(id,pos) {id, pos} +#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) + +/* + * Define boundaries of position values. + */ +#define BHV_POSITION_INVALID 0 /* invalid position number */ +#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ +#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ + +/* + * Plumbing macros. + */ +#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) +#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) +#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) +#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) +#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) +#define BHV_PDATA(bdp) (bdp)->bd_pdata +#define BHV_OPS(bdp) (bdp)->bd_ops +#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) +#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) + +extern void bhv_head_init(bhv_head_t *, char *); +extern void bhv_head_destroy(bhv_head_t *); +extern int bhv_insert(bhv_head_t *, bhv_desc_t *); +extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); + +/* + * Initialize a new behavior descriptor. + * Arguments: + * bdp - pointer to behavior descriptor + * pdata - pointer to behavior's private data + * vobj - pointer to associated virtual object + * ops - pointer to ops for this behavior + */ +#define bhv_desc_init(bdp, pdata, vobj, ops) \ + { \ + (bdp)->bd_pdata = pdata; \ + (bdp)->bd_vobj = vobj; \ + (bdp)->bd_ops = ops; \ + (bdp)->bd_next = NULL; \ + } + +/* + * Remove a behavior descriptor from a behavior chain. + */ +#define bhv_remove(bhp, bdp) \ + { \ + if ((bhp)->bh_first == (bdp)) { \ + /* \ + * Remove from front of chain. \ + * Atomic wrt oip's. \ + */ \ + (bhp)->bh_first = (bdp)->bd_next; \ + } else { \ + /* remove from non-front of chain */ \ + bhv_remove_not_first(bhp, bdp); \ + } \ + (bdp)->bd_vobj = NULL; \ + } + +/* + * Behavior module prototypes. + */ +extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); +extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); +extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); +extern bhv_desc_t * bhv_base(bhv_head_t *bhp); + +/* No bhv locking on Linux */ +#define bhv_lookup_unlocked bhv_lookup +#define bhv_base_unlocked bhv_base + +#endif /* __XFS_BEHAVIOR_H__ */ diff -Nru a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c --- a/fs/xfs/xfs_bmap_btree.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_bmap_btree.c Thu Jan 29 23:15:58 2004 @@ -61,7 +61,7 @@ #include "xfs_error.h" #include "xfs_quota.h" -#ifdef DEBUG +#if defined(XFS_BMBT_TRACE) ktrace_t *xfs_bmbt_trace_buf; #endif diff -Nru a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h --- a/fs/xfs/xfs_buf.h Thu Jan 29 23:15:58 2004 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * Further, this software is distributed without any warranty that it is - * free of the rightful claim of any third person regarding infringement - * or the like. Any license provided herein, whether implied or - * otherwise, applies only to this software file. Patent licenses, if - * any, provided herein do not apply to combinations of this program with - * other software, or any other product whatsoever. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, - * Mountain View, CA 94043, or: - * - * http://www.sgi.com - * - * For further information regarding this notice, see: - * - * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ - */ -#ifndef __XFS_BUF_H__ -#define __XFS_BUF_H__ - -/* These are just for xfs_syncsub... it sets an internal variable - * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t - */ -#define XFS_B_ASYNC PBF_ASYNC -#define XFS_B_DELWRI PBF_DELWRI -#define XFS_B_READ PBF_READ -#define XFS_B_WRITE PBF_WRITE -#define XFS_B_STALE PBF_STALE - -#define XFS_BUF_TRYLOCK PBF_TRYLOCK -#define XFS_INCORE_TRYLOCK PBF_TRYLOCK -#define XFS_BUF_LOCK PBF_LOCK -#define XFS_BUF_MAPPED PBF_MAPPED - -#define BUF_BUSY PBF_DONT_BLOCK - -#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) -#define XFS_BUF_ZEROFLAGS(x) \ - ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI)) - -#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) -#define XFS_BUF_SUPER_STALE(x) do { \ - XFS_BUF_STALE(x); \ - xfs_buf_undelay(x); \ - XFS_BUF_DONE(x); \ - } while (0) - -#define XFS_BUF_MANAGE PBF_FS_MANAGED -#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) - -static inline void xfs_buf_undelay(page_buf_t *pb) -{ - if (pb->pb_flags & PBF_DELWRI) { - if (pb->pb_list.next != &pb->pb_list) { - pagebuf_delwri_dequeue(pb); - pagebuf_rele(pb); - } else { - pb->pb_flags &= ~PBF_DELWRI; - } - } -} - -#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(x) xfs_buf_undelay(x) -#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) - -#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) -#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) -#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) - -#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)) -#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE) -#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x))) - -#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO) -#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO) -#define XFS_BUF_ISBUSY(x) (1) - -#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) -#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) -#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) - -#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) -#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) -#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) - -#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") -#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") -#define XFS_BUF_ISSHUT(x) (0) - -#define XFS_BUF_HOLD(x) pagebuf_hold(x) -#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) -#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= 
~PBF_READ) -#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) - -#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) -#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) -#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) - -#define XFS_BUF_ISUNINITIAL(x) (0) -#define XFS_BUF_UNUNINITIAL(x) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) 1 - -typedef struct page_buf_s xfs_buf_t; -#define xfs_buf page_buf_s - -typedef struct pb_target xfs_buftarg_t; -#define xfs_buftarg pb_target - -#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD) -#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD) - -#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone -#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ - (buf)->pb_iodone = (func) -#define XFS_BUF_CLR_IODONE_FUNC(buf) \ - (buf)->pb_iodone = NULL -#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ - (buf)->pb_strat = (func) -#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ - (buf)->pb_strat = NULL - -#define XFS_BUF_FSPRIVATE(buf, type) \ - ((type)(buf)->pb_fspriv) -#define XFS_BUF_SET_FSPRIVATE(buf, value) \ - (buf)->pb_fspriv = (void *)(value) -#define XFS_BUF_FSPRIVATE2(buf, type) \ - ((type)(buf)->pb_fspriv2) -#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ - (buf)->pb_fspriv2 = (void *)(value) -#define XFS_BUF_FSPRIVATE3(buf, type) \ - ((type)(buf)->pb_fspriv3) -#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ - (buf)->pb_fspriv3 = (void *)(value) -#define XFS_BUF_SET_START(buf) - -#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ - (buf)->pb_relse = (value) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) - -extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset) -{ - if (bp->pb_flags & PBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; - return (xfs_caddr_t) pagebuf_offset(bp, offset); -} - -#define XFS_BUF_SET_PTR(bp, val, count) \ - pagebuf_associate_memory(bp, val, count) -#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) -#define XFS_BUF_SET_ADDR(bp, blk) \ - ((bp)->pb_bn = (page_buf_daddr_t)(blk)) -#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) \ - ((bp)->pb_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) \ - ((bp)->pb_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) \ - ((bp)->pb_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) - -#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) -#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) -#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) -#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); - -/* setup the buffer target from a buftarg structure */ -#define XFS_BUF_SET_TARGET(bp, target) \ - (bp)->pb_target = (target) -#define XFS_BUF_TARGET(bp) ((bp)->pb_target) -#define XFS_BUFTARG_NAME(target) \ - pagebuf_target_name(target) - -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -#define xfs_buf_read(target, blkno, len, flags) \ - pagebuf_get((target), (blkno), (len), \ - PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE) -#define xfs_buf_get(target, blkno, len, flags) \ - pagebuf_get((target), (blkno), (len), \ - PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE) - -#define xfs_buf_read_flags(target, blkno, len, flags) \ - pagebuf_get((target), (blkno), (len), \ - PBF_READ | PBF_MAPPABLE | flags) 
-#define xfs_buf_get_flags(target, blkno, len, flags) \ - pagebuf_get((target), (blkno), (len), \ - PBF_MAPPABLE | flags) - -static inline int xfs_bawrite(void *mp, page_buf_t *bp) -{ - bp->pb_fspriv3 = mp; - bp->pb_strat = xfs_bdstrat_cb; - xfs_buf_undelay(bp); - return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES); -} - -static inline void xfs_buf_relse(page_buf_t *bp) -{ - if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse) - pagebuf_unlock(bp); - pagebuf_rele(bp); -} - -#define xfs_bpin(bp) pagebuf_pin(bp) -#define xfs_bunpin(bp) pagebuf_unpin(bp) - -#define xfs_buftrace(id, bp) \ - pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) - -#define xfs_biodone(pb) \ - pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0) - -#define xfs_incore(buftarg,blkno,len,lockit) \ - pagebuf_find(buftarg, blkno ,len, lockit) - - -#define xfs_biomove(pb, off, len, data, rw) \ - pagebuf_iomove((pb), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ) - -#define xfs_biozero(pb, off, len) \ - pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) - - -static inline int XFS_bwrite(page_buf_t *pb) -{ - int iowait = (pb->pb_flags & PBF_ASYNC) == 0; - int error = 0; - - pb->pb_flags |= PBF_SYNC; - if (!iowait) - pb->pb_flags |= PBF_RUN_QUEUES; - - xfs_buf_undelay(pb); - pagebuf_iostrategy(pb); - if (iowait) { - error = pagebuf_iowait(pb); - xfs_buf_relse(pb); - } - return error; -} - -#define XFS_bdwrite(pb) \ - pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) - -static inline int xfs_bdwrite(void *mp, page_buf_t *bp) -{ - bp->pb_strat = xfs_bdstrat_cb; - bp->pb_fspriv3 = mp; - - return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); -} - -#define XFS_bdstrat(bp) pagebuf_iorequest(bp) - -#define xfs_iowait(pb) pagebuf_iowait(pb) - - -/* - * Go through all incore buffers, and release buffers - * if they belong to the given device. This is used in - * filesystem error handling to preserve the consistency - * of its metadata. - */ - -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg) - -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg) - -#define xfs_incore_relse(buftarg,delwri_only,wait) \ - xfs_relse_buftarg(buftarg) - -#define xfs_baread(target, rablkno, ralen) \ - pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) - -#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) -#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) -#define xfs_buf_free(bp) pagebuf_free(bp) - -#endif /* __XFS_BUF_H__ */ diff -Nru a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c --- a/fs/xfs/xfs_dir.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_dir.c Thu Jan 29 23:15:58 2004 @@ -162,7 +162,7 @@ xfs_dir_put_t put); STATIC int xfs_dir_node_replace(xfs_da_args_t *args); -#if defined(DEBUG) +#if defined(XFS_DIR_TRACE) ktrace_t *xfs_dir_trace_buf; #endif diff -Nru a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c --- a/fs/xfs/xfs_dir2_trace.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_dir2_trace.c Thu Jan 29 23:15:58 2004 @@ -49,11 +49,9 @@ #include "xfs_da_btree.h" #include "xfs_dir2_trace.h" -#ifdef DEBUG +#ifdef XFS_DIR2_TRACE ktrace_t *xfs_dir2_trace_buf; -#endif /* DEBUG */ -#ifdef XFS_DIR2_TRACE /* * Enter something in the trace buffers. */ diff -Nru a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/xfs/xfs_iomap.c Thu Jan 29 23:15:58 2004 @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_STRAT_WRITE_IMAPS 2 +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_imap_to_bmap( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + xfs_iomap_t *iomapp, + int imaps, /* Number of imap entries */ + int iomaps, /* Number of iomap entries */ + int flags) +{ + xfs_mount_t *mp; + xfs_fsize_t nisize; + int pbm; + xfs_fsblock_t start_block; + + mp = io->io_mount; + nisize = XFS_SIZE(mp, io); + if (io->io_new_size > nisize) + nisize = io->io_new_size; + + for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { + iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomapp->iomap_delta = offset - iomapp->iomap_offset; + iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomapp->iomap_flags = flags; + + start_block = imap->br_startblock; + if (start_block == HOLESTARTBLOCK) { + iomapp->iomap_bn = IOMAP_DADDR_NULL; + iomapp->iomap_flags = IOMAP_HOLE; + } else if (start_block == DELAYSTARTBLOCK) { + iomapp->iomap_bn = IOMAP_DADDR_NULL; + iomapp->iomap_flags = IOMAP_DELAY; + } else { + iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block); + if (ISUNWRITTEN(imap)) + iomapp->iomap_flags |= IOMAP_UNWRITTEN; + } + + if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) { + iomapp->iomap_flags |= IOMAP_EOF; + } + + offset += iomapp->iomap_bsize - iomapp->iomap_delta; + } + return pbm; /* Return the number filled */ +} + +int +xfs_iomap( + xfs_iocore_t *io, + xfs_off_t offset, + ssize_t count, + int flags, + xfs_iomap_t *iomapp, + int *niomaps) +{ + xfs_mount_t *mp = io->io_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int bmapi_flags = 0; + int iomap_flags = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + switch (flags & + (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE | + BMAPI_UNWRITTEN | BMAPI_DEVICE)) { + case BMAPI_READ: + lockmode = XFS_LCK_MAP_SHARED(mp, io); + bmapi_flags = XFS_BMAPI_ENTIRE; + if (flags & BMAPI_IGNSTATE) + bmapi_flags |= XFS_BMAPI_IGSTATE; + break; + case BMAPI_WRITE: + lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; + bmapi_flags = 0; + XFS_ILOCK(mp, io, lockmode); + break; + case BMAPI_ALLOCATE: + lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; + bmapi_flags = XFS_BMAPI_ENTIRE; + /* Attempt non-blocking lock */ + if (flags & BMAPI_TRYLOCK) { + if (!XFS_ILOCK_NOWAIT(mp, io, lockmode)) + return XFS_ERROR(EAGAIN); + } else { + XFS_ILOCK(mp, io, lockmode); + } + break; + case BMAPI_UNWRITTEN: + goto phase2; + case BMAPI_DEVICE: + lockmode = XFS_LCK_MAP_SHARED(mp, io); + iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + error = 0; + *niomaps = 1; + goto out; + default: + BUG(); + } + + ASSERT(offset <= mp->m_maxioffset); + if ((xfs_fsize_t)offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = XFS_BMAPI(mp, NULL, io, offset_fsb, + (xfs_filblks_t)(end_fsb - offset_fsb), + bmapi_flags, NULL, 0, &imap, + &nimaps, NULL); + + if (error) + goto out; + +phase2: + switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) { + case BMAPI_WRITE: + /* If we found an extent, return it */ + if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) + break; + + if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { + error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset, + count, flags, &imap, &nimaps, nimaps); + } else { + error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, + flags, &imap, &nimaps); + } + iomap_flags = IOMAP_NEW; + break; + case BMAPI_ALLOCATE: + /* If we found an extent, return it */ + XFS_IUNLOCK(mp, io, lockmode); + lockmode = 0; + + if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) + break; + + error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps); + break; + case BMAPI_UNWRITTEN: + lockmode = 0; + error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count); + nimaps = 0; + break; + } + + if (nimaps) { + *niomaps = xfs_imap_to_bmap(io, offset, &imap, + iomapp, nimaps, *niomaps, iomap_flags); + } else if (niomaps) { + *niomaps = 0; + } + +out: + if (lockmode) + XFS_IUNLOCK(mp, io, lockmode); + return XFS_ERROR(error); +} + +STATIC int +xfs_flush_space( + xfs_inode_t *ip, + int *fsynced, + int *ioflags) +{ + switch (*fsynced) { + case 0: + if (ip->i_delayed_blks) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inode(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + *fsynced = 1; + } else { + *ioflags |= BMAPI_SYNC; + *fsynced = 2; + } + return 0; + case 1: + *fsynced = 2; + *ioflags |= BMAPI_SYNC; + return 0; + case 2: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_device(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + *fsynced = 3; + return 0; + } + return 1; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + loff_t offset, + size_t count, + int flags, + xfs_bmbt_irec_t *ret_imap, + int *nmaps, + int found) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb; + xfs_fsize_t isize; + xfs_fsblock_t firstfsb; + int nimaps, maps; + int error; + int bmapi_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp; + xfs_bmap_free_t free_list; + int aeof; + xfs_filblks_t datablocks; + int committed; + int numrtextents; + uint resblks; + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
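+ * (XFS_QMOPT_ILOCKED indicates the caller already holds the ilock, so + * the quota code will not try to take it again.)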
+ */ + + error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED); + if (error) + return XFS_ERROR(error); + + maps = min(XFS_WRITE_IMAPS, *nmaps); + nimaps = maps; + + isize = ip->i_d.di_size; + aeof = (offset + count) > isize; + + if (io->io_new_size > isize) + isize = io->io_new_size; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + count_fsb = last_fsb - offset_fsb; + if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { + xfs_fileoff_t map_last_fsb; + + map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; + + if (map_last_fsb < last_fsb) { + last_fsb = map_last_fsb; + count_fsb = last_fsb - offset_fsb; + } + ASSERT(count_fsb > 0); + } + + /* + * determine if reserving space on + * the data or realtime partition. + */ + if ((rt = XFS_IS_REALTIME_INODE(ip))) { + int sbrtextsize, iprtextsize; + + sbrtextsize = mp->m_sb.sb_rextsize; + iprtextsize = + ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize; + numrtextents = (count_fsb + iprtextsize - 1); + do_div(numrtextents, sbrtextsize); + datablocks = 0; + } else { + datablocks = count_fsb; + numrtextents = 0; + } + + /* + * allocate and setup the transaction + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + + resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); + + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), numrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) + /* + * Free the transaction structure. + */ + xfs_trans_cancel(tp, 0); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + goto error_out; /* Don't return in above if .. trans .., + need lock to return */ + + if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) { + error = (EDQUOT); + goto error1; + } + nimaps = 1; + + bmapi_flag = XFS_BMAPI_WRITE; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) + bmapi_flag |= XFS_BMAPI_PREALLOC; + + /* + * issue the bmapi() call to allocate the blocks + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + imapp = &imap[0]; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + + error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + goto error_out; + } + + /* copy any maps to caller's array and return any error. 
*/ + if (nimaps == 0) { + error = (ENOSPC); + goto error_out; + } + + *ret_imap = imap[0]; + *nmaps = 1; + return 0; + + error0: /* Cancel bmap, unlock inode, and cancel trans */ + xfs_bmap_cancel(&free_list); + + error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + *nmaps = 0; /* nothing set-up here */ + +error_out: + return XFS_ERROR(error); +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + loff_t offset, + size_t count, + int ioflag, + xfs_bmbt_irec_t *ret_imap, + int *nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_fsize_t isize; + xfs_fsblock_t firstblock; + int nimaps; + int error; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int aeof; + int fsynced = 0; + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. + */ + + error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); + if (error) + return XFS_ERROR(error); + +retry: + isize = ip->i_d.di_size; + if (io->io_new_size > isize) { + isize = io->io_new_size; + } + + aeof = 0; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + /* + * If the caller is doing a write at the end of the file, + * then extend the allocation (and the buffer used for the write) + * out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * We don't bother with this for sync writes, because we need + * to minimize the amount we write for good performance. + */ + if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) { + xfs_off_t aligned_offset; + unsigned int iosize; + xfs_fileoff_t ioalign; + + iosize = mp->m_writeio_blocks; + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + iosize; + aeof = 1; + } + + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; + + /* + * Round up the allocation request to an m_dalign boundary if the file + * size is greater than 512K and we are allocating past the allocation eof + */ + if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) { + int eof; + xfs_fileoff_t new_last_fsb; + new_last_fsb = roundup_64(last_fsb, mp->m_dalign); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) { + return error; + } + if (eof) { + last_fsb = new_last_fsb; + } + } + + error = xfs_bmapi(NULL, ip, offset_fsb, + (xfs_filblks_t)(last_fsb - offset_fsb), + XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | + XFS_BMAPI_ENTIRE, &firstblock, 1, imap, + &nimaps, NULL); + /* + * This can be EDQUOT, if nimaps == 0 + */ + if (error && (error != ENOSPC)) { + return XFS_ERROR(error); + } + /* + * If bmapi returned us nothing, and if we didn't get back EDQUOT, + * then we must have run out of space. + */ + + if (nimaps == 0) { + if (xfs_flush_space(ip, &fsynced, &ioflag)) + return XFS_ERROR(ENOSPC); + + error = 0; + goto retry; + } + + *ret_imap = imap[0]; + *nmaps = 1; + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating caller's request. + * + * Called without a lock on the inode.
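+ * + * For example, the delalloc writeout path reaches this function through + * xfs_iomap(..., BMAPI_ALLOCATE, ...) above: the delayed mapping it found + * comes in as *map, and on return *retmap indicates whether *map now + * describes real blocks covering the original request.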
+ */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_bmbt_irec_t *map, + int *retmap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS]; + xfs_trans_t *tp; + int i, nimaps, committed; + int error = 0; + int nres; + + *retmap = 0; + + /* + * Make sure that the dquots are there. + */ + + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return XFS_ERROR(error); + + offset_fsb = map->br_startoff; + count_fsb = map->br_blockcount; + map_start_fsb = offset_fsb; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. + */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + if (error == ENOSPC) { + error = xfs_trans_reserve(tp, 0, + XFS_WRITE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + } + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + XFS_BMAP_INIT(&free_list, &first_block); + + nimaps = XFS_STRAT_WRITE_IMAPS; + /* + * Ensure we don't go beyond eof - it is possible + * the extents changed since we did the read call, + * we dropped the ilock in the interim. + */ + + end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); + xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = EAGAIN; + goto trans_cancel; + } + } + + /* Go get the actual blocks */ + error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, + XFS_BMAPI_WRITE, &first_block, 1, + imap, &nimaps, &free_list); + + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, + first_block, &committed); + + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES, NULL); + + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + + for (i = 0; i < nimaps; i++) { + if ((map->br_startoff >= imap[i].br_startoff) && + (map->br_startoff < (imap[i].br_startoff + + imap[i].br_blockcount))) { + *map = imap[i]; + *retmap = 1; + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + count_fsb -= imap[i].br_blockcount; + } + + /* So far we have not mapped the requested part of the + * file, just surrounding data, try again. 
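+ * Below, nimaps is decremented to index the last extent returned, and + * map_start_fsb is advanced past its end so the next pass through the + * outer loop works on the part of the range that is still unmapped.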
+ */ + nimaps--; + offset_fsb = imap[nimaps].br_startoff + + imap[nimaps].br_blockcount; + map_start_fsb = offset_fsb; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + loff_t offset, + size_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_bmbt_irec_t imap; + int committed; + int error; + int nres; + int nimaps; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, count); + + do { + nres = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto error0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + /* + * Modify the unwritten extent state of the buffer. + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_WRITE, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_bmap_finish(&(tp), &(free_list), + firstfsb, &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error0; + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. + */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + return XFS_ERROR(error); +} diff -Nru a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c --- a/fs/xfs/xfs_log_recover.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_log_recover.c Thu Jan 29 23:15:58 2004 @@ -1531,7 +1531,7 @@ xlog_recover_item_t *first_item, *itemq, *itemq_next; xfs_buf_log_format_t *buf_f; xfs_buf_log_format_v1_t *obuf_f; - ushort flags; + ushort flags = 0; first_item = itemq = trans->r_itemq; trans->r_itemq = NULL; diff -Nru a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h --- a/fs/xfs/xfs_types.h Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_types.h Thu Jan 29 23:15:58 2004 @@ -75,24 +75,6 @@ #error BITS_PER_LONG must be 32 or 64 #endif -/* - * Some types are conditional depending on the target system. - * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. - * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well - * as requiring XFS_BIG_BLKNOS to be set. - */ -#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) -# define XFS_BIG_BLKNOS 1 -# if BITS_PER_LONG == 64 -# define XFS_BIG_INUMS 1 -# else -# define XFS_BIG_INUMS 0 -# endif -#else -# define XFS_BIG_BLKNOS 0 -# define XFS_BIG_INUMS 0 -#endif - #endif /* __KERNEL__ */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ diff -Nru a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c --- a/fs/xfs/xfs_vfsops.c Thu Jan 29 23:15:58 2004 +++ b/fs/xfs/xfs_vfsops.c Thu Jan 29 23:15:58 2004 @@ -171,6 +171,25 @@ xfs_sysctl_unregister(); xfs_refcache_destroy(); +#ifdef XFS_DIR2_TRACE + ktrace_free(xfs_dir2_trace_buf); +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); +#endif +#ifdef XFS_DIR_TRACE + ktrace_free(xfs_dir_trace_buf); +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); +#endif + kmem_cache_destroy(xfs_bmap_free_item_zone); kmem_cache_destroy(xfs_btree_cur_zone); kmem_cache_destroy(xfs_inode_zone);
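
As a quick illustration of the ordering contract enforced by the new bhv_insert() (higher positions first, duplicate ops vectors rejected), here is a minimal userspace sketch. It is not kernel code: the types are pared-down copies of bhv_head_t/bhv_desc_t from xfs_behavior.h, ASSERT is dropped, EINVAL is hard-coded, and the two ops vectors are hypothetical.

#include <assert.h>
#include <stdio.h>

typedef struct bhv_identity {
	unsigned short	bi_id;		/* owning subsystem id */
	unsigned short	bi_position;	/* position in chain */
} bhv_identity_t;

typedef struct bhv_desc {
	void		*bd_pdata;	/* private data for this behavior */
	void		*bd_vobj;	/* associated virtual object */
	void		*bd_ops;	/* ops vector; begins with identity */
	struct bhv_desc	*bd_next;	/* next behavior in chain */
} bhv_desc_t;

typedef struct bhv_head {
	bhv_desc_t	*bh_first;	/* first (highest-position) behavior */
} bhv_head_t;

#define BHV_IDENTITY(bdp)	((bhv_identity_t *)(bdp)->bd_ops)
#define BHV_POSITION(bdp)	(BHV_IDENTITY(bdp)->bi_position)

/* Same ordering rule as the kernel bhv_insert(): higher positions first. */
static int
bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
{
	bhv_desc_t	*curdesc, *prev = NULL;
	int		position = BHV_POSITION(bdp);

	for (curdesc = bhp->bh_first; curdesc != NULL;
	     curdesc = curdesc->bd_next) {
		if (curdesc->bd_ops == bdp->bd_ops)
			return 22;	/* duplicate ops vector: EINVAL */
		if (position >= BHV_POSITION(curdesc))
			break;		/* new behavior goes before curdesc */
		prev = curdesc;
	}
	if (prev == NULL) {
		bdp->bd_next = bhp->bh_first;	/* new front of chain */
		bhp->bh_first = bdp;
	} else {
		bdp->bd_next = prev->bd_next;	/* splice in after prev */
		prev->bd_next = bdp;
	}
	return 0;
}

int
main(void)
{
	/* Hypothetical ops vectors: a base layer and an interposer. */
	bhv_identity_t	base_ops = { 0, 1 };	/* BHV_POSITION_BASE */
	bhv_identity_t	top_ops = { 0, 63 };	/* BHV_POSITION_TOP */
	bhv_desc_t	base = { NULL, NULL, &base_ops, NULL };
	bhv_desc_t	top = { NULL, NULL, &top_ops, NULL };
	bhv_head_t	head = { NULL };
	bhv_desc_t	*bdp;

	assert(bhv_insert(&head, &base) == 0);
	assert(bhv_insert(&head, &top) == 0);	/* lands in front of base */

	/* The chain is walked front to back: 63 prints before 1. */
	for (bdp = head.bh_first; bdp != NULL; bdp = bdp->bd_next)
		printf("position %d\n", BHV_POSITION(bdp));
	return 0;
}

Compiled and run, this prints position 63 before position 1, matching the front-to-back invocation order described in the xfs_behavior.h comments.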