Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

76
fs/ocfs2/Kconfig Normal file
View file

@ -0,0 +1,76 @@
config OCFS2_FS
tristate "OCFS2 file system support"
depends on NET && SYSFS && CONFIGFS_FS
select JBD2
select CRC32
select QUOTA
select QUOTA_TREE
select FS_POSIX_ACL
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
numbers, and has automatically extending metadata groups which may
also make it attractive for non-clustered use.
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2".
Project web page: http://oss.oracle.com/projects/ocfs2
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
For more information on OCFS2, see the file
<file:Documentation/filesystems/ocfs2.txt>.
config OCFS2_FS_O2CB
tristate "O2CB Kernelspace Clustering"
depends on OCFS2_FS
default y
help
OCFS2 includes a simple kernelspace clustering package, the OCFS2
Cluster Base. It only requires a very small userspace component
to configure it. This comes with the standard ocfs2-tools package.
O2CB is limited to maintaining a cluster for OCFS2 file systems.
It cannot manage any other cluster applications.
It is always safe to say Y here, as the clustering method is
run-time selectable.
config OCFS2_FS_USERSPACE_CLUSTER
tristate "OCFS2 Userspace Clustering"
depends on OCFS2_FS && DLM
default y
help
This option will allow OCFS2 to use userspace clustering services
in conjunction with the DLM in fs/dlm. If you are using a
userspace cluster manager, say Y here.
It is safe to say Y, as the clustering method is run-time
selectable.
config OCFS2_FS_STATS
bool "OCFS2 statistics"
depends on OCFS2_FS && DEBUG_FS
default y
help
This option allows some fs statistics to be captured. Enabling
this option may increase the memory consumption.
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
depends on OCFS2_FS
default y
help
The ocfs2 filesystem has an extensive logging system. The system
allows selection of events to log via files in /sys/o2cb/logmask/.
This option will enlarge your kernel, but it allows debugging of
ocfs2 filesystem issues.
config OCFS2_DEBUG_FS
bool "OCFS2 expensive checks"
depends on OCFS2_FS
default n
help
This option will enable expensive consistency checks. Enable
this option for debugging only as it is likely to decrease
performance of the filesystem.

53
fs/ocfs2/Makefile Normal file
View file

@ -0,0 +1,53 @@
ccflags-y := -Ifs/ocfs2
ccflags-y += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
ocfs2_stackglue.o
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
ocfs2-objs := \
alloc.o \
aops.o \
blockcheck.o \
buffer_head_io.o \
dcache.o \
dir.o \
dlmglue.o \
export.o \
extent_map.o \
file.o \
heartbeat.o \
inode.o \
ioctl.o \
journal.o \
localalloc.o \
locks.o \
mmap.o \
namei.o \
refcounttree.o \
reservations.o \
move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
quota_local.o \
quota_global.o \
xattr.o \
acl.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
ocfs2_stack_user-objs := stack_user.o
obj-$(CONFIG_OCFS2_FS) += dlmfs/
# cluster/ is always needed when OCFS2_FS for masklog support
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/

312
fs/ocfs2/acl.c Normal file
View file

@ -0,0 +1,312 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* acl.c
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
*
* CREDITS:
* Lots of code in this file is copy from linux/fs/ext3/acl.c.
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "ocfs2_fs.h"
#include "xattr.h"
#include "acl.h"
/*
* Convert from xattr value to acl struct.
*/
static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
{
int n, count;
struct posix_acl *acl;
if (!value)
return NULL;
if (size < sizeof(struct posix_acl_entry))
return ERR_PTR(-EINVAL);
count = size / sizeof(struct posix_acl_entry);
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
for (n = 0; n < count; n++) {
struct ocfs2_acl_entry *entry =
(struct ocfs2_acl_entry *)value;
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
switch(acl->a_entries[n].e_tag) {
case ACL_USER:
acl->a_entries[n].e_uid =
make_kuid(&init_user_ns,
le32_to_cpu(entry->e_id));
break;
case ACL_GROUP:
acl->a_entries[n].e_gid =
make_kgid(&init_user_ns,
le32_to_cpu(entry->e_id));
break;
default:
break;
}
value += sizeof(struct posix_acl_entry);
}
return acl;
}
/*
* Convert acl struct to xattr value.
*/
static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
{
struct ocfs2_acl_entry *entry = NULL;
char *ocfs2_acl;
size_t n;
*size = acl->a_count * sizeof(struct posix_acl_entry);
ocfs2_acl = kmalloc(*size, GFP_NOFS);
if (!ocfs2_acl)
return ERR_PTR(-ENOMEM);
entry = (struct ocfs2_acl_entry *)ocfs2_acl;
for (n = 0; n < acl->a_count; n++, entry++) {
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
switch(acl->a_entries[n].e_tag) {
case ACL_USER:
entry->e_id = cpu_to_le32(
from_kuid(&init_user_ns,
acl->a_entries[n].e_uid));
break;
case ACL_GROUP:
entry->e_id = cpu_to_le32(
from_kgid(&init_user_ns,
acl->a_entries[n].e_gid));
break;
default:
entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
break;
}
}
return ocfs2_acl;
}
static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
int type,
struct buffer_head *di_bh)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
}
retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
if (retval > 0) {
value = kmalloc(retval, GFP_NOFS);
if (!value)
return ERR_PTR(-ENOMEM);
retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
"", value, retval);
}
if (retval > 0)
acl = ocfs2_acl_from_xattr(value, retval);
else if (retval == -ENODATA || retval == 0)
acl = NULL;
else
acl = ERR_PTR(retval);
kfree(value);
return acl;
}
/*
* Helper function to set i_mode in memory and disk. Some call paths
* will not have di_bh or a journal handle to pass, in which case it
* will create it's own.
*/
static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
handle_t *handle, umode_t new_mode)
{
int ret, commit_handle = 0;
struct ocfs2_dinode *di;
if (di_bh == NULL) {
ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
} else
get_bh(di_bh);
if (handle == NULL) {
handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_brelse;
}
commit_handle = 1;
}
di = (struct ocfs2_dinode *)di_bh->b_data;
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
inode->i_mode = new_mode;
inode->i_ctime = CURRENT_TIME;
di->i_mode = cpu_to_le16(inode->i_mode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
if (commit_handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_brelse:
brelse(di_bh);
out:
return ret;
}
/*
* Set the access or default ACL of an inode.
*/
int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac)
{
int name_index;
void *value = NULL;
size_t size = 0;
int ret;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
ret = posix_acl_equiv_mode(acl, &mode);
if (ret < 0)
return ret;
else {
if (ret == 0)
acl = NULL;
ret = ocfs2_acl_set_mode(inode, di_bh,
handle, mode);
if (ret)
return ret;
}
}
break;
case ACL_TYPE_DEFAULT:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
default:
return -EINVAL;
}
if (acl) {
value = ocfs2_acl_to_xattr(acl, &size);
if (IS_ERR(value))
return (int)PTR_ERR(value);
}
if (handle)
ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
"", value, size, 0,
meta_ac, data_ac);
else
ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
kfree(value);
return ret;
}
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
{
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
int ret = -EAGAIN;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret < 0)
return ERR_PTR(ret);
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
brelse(di_bh);
return acl;
}

39
fs/ocfs2/acl.h Normal file
View file

@ -0,0 +1,39 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* acl.h
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_ACL_H
#define OCFS2_ACL_H
#include <linux/posix_acl_xattr.h>
struct ocfs2_acl_entry {
__le16 e_tag;
__le16 e_perm;
__le32 e_id;
};
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac);
#endif /* OCFS2_ACL_H */

7395
fs/ocfs2/alloc.c Normal file

File diff suppressed because it is too large Load diff

324
fs/ocfs2/alloc.h Normal file
View file

@ -0,0 +1,324 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* alloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
/*
* For xattr tree leaf, we limit the leaf byte size to be 64K.
*/
#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
/*
* ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
* the b-tree operations in ocfs2. Now all the b-tree operations are not
* limited to ocfs2_dinode only. Any data which need to allocate clusters
* to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
* and operation.
*
* ocfs2_extent_tree becomes the first-class object for extent tree
* manipulation. Callers of the alloc.c code need to fill it via one of
* the ocfs2_init_*_extent_tree() operations below.
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
* functions. It needs the ocfs2_caching_info structure associated with
* I/O on the tree. With metadata ecc, we now call different journal_access
* functions for each type of metadata, so it must have the
* root_journal_access function.
* ocfs2_extent_tree_operations abstract the normal operations we do for
* the root of extent b-tree.
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
};
/*
* ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
* specified object buffer.
*/
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ci,
struct buffer_head *bh);
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ci,
struct buffer_head *bh);
struct ocfs2_xattr_value_buf;
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ci,
struct ocfs2_xattr_value_buf *vb);
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ci,
struct buffer_head *bh);
void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ci,
struct buffer_head *bh);
/*
* Read an extent block into *bh. If *bh is NULL, a bh will be
* allocated. This is a cached read. The extent block will be validated
* with ocfs2_validate_extent_block().
*/
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
struct buffer_head **bh);
struct ocfs2_alloc_context;
int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
u8 flags,
struct ocfs2_alloc_context *meta_ac);
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,
RESTART_META
};
int ocfs2_add_clusters_in_btree(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
struct ocfs2_path;
int ocfs2_split_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_change_extent_flag(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int new_flags, int clear_flags);
int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
u64 refcount_loc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct ocfs2_extent_tree *et);
/*
* how many new metadata chunks would an allocation need at maximum?
*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
{
/*
* Rather than do all the work of determining how much we need
* (involves a ton of reads and locks), just ask for the
* maximal limit. That's a tree depth shift. So, one block for
* level of the tree (current l_tree_depth), one block for the
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
return le16_to_cpu(root_el->l_tree_depth) + 2;
}
void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh);
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel);
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
int slot_num,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
handle_t *handle,
u64 start_blk,
unsigned int num_clusters);
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
/*
* Process local structure which describes the block unlinks done
* during an operation. This is populated via
* ocfs2_cache_block_dealloc().
*
* ocfs2_run_deallocs() should be called after the potentially
* de-allocating routines. No journal handles should be open, and most
* locks should have been dropped.
*/
struct ocfs2_cached_dealloc_ctxt {
struct ocfs2_per_slot_free_list *c_first_suballocator;
struct ocfs2_cached_block_free *c_global_allocator;
};
static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
{
c->c_first_suballocator = NULL;
c->c_global_allocator = NULL;
}
int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
u64 blkno, unsigned int bit);
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int type, int slot, u64 suballoc, u64 blkno,
unsigned int bit);
static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
{
return c->c_global_allocator != NULL;
}
int ocfs2_run_deallocs(struct ocfs2_super *osb,
struct ocfs2_cached_dealloc_ctxt *ctxt);
struct ocfs2_truncate_context {
struct ocfs2_cached_dealloc_ctxt tc_dealloc;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *di_bh);
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
unsigned int start, unsigned int end, int trunc);
int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *root_el, u32 cpos,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *rec)
{
/*
* Cluster count in extent records is slightly different
* between interior nodes and leaf nodes. This is to support
* unwritten extents which need a flags field in leaf node
* records, thus shrinking the available space for a clusters
* field.
*/
if (el->l_tree_depth)
return le32_to_cpu(rec->e_int_clusters);
else
return le16_to_cpu(rec->e_leaf_clusters);
}
/*
* This is only valid for leaf nodes, which are the only ones that can
* have empty extents anyway.
*/
static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
return !rec->e_leaf_clusters;
}
int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
struct page **pages, int *num);
void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
unsigned int from, unsigned int to,
struct page *page, int zero, u64 *phys);
/*
* Structures which describe a path through a btree, and functions to
* manipulate them.
*
* The idea here is to be as generic as possible with the tree
* manipulation code.
*/
struct ocfs2_path_item {
struct buffer_head *bh;
struct ocfs2_extent_list *el;
};
#define OCFS2_MAX_PATH_DEPTH 5
struct ocfs2_path {
int p_tree_depth;
ocfs2_journal_access_func p_root_access;
struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_root_access(_path)((_path)->p_root_access)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
void ocfs2_free_path(struct ocfs2_path *path);
int ocfs2_find_path(struct ocfs2_caching_info *ci,
struct ocfs2_path *path,
u32 cpos);
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
int ocfs2_path_bh_journal_access(handle_t *handle,
struct ocfs2_caching_info *ci,
struct ocfs2_path *path,
int idx);
int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
handle_t *handle,
struct ocfs2_path *path);
int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
struct ocfs2_path *path, u32 *cpos);
int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
struct ocfs2_path *path, u32 *cpos);
int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
struct ocfs2_path *left,
struct ocfs2_path *right);
#endif /* OCFS2_ALLOC_H */

2094
fs/ocfs2/aops.c Normal file

File diff suppressed because it is too large Load diff

105
fs/ocfs2/aops.h Normal file
View file

@ -0,0 +1,105 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
#include <linux/aio.h>
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to);
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new);
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh));
int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
int ocfs2_write_begin_nolock(struct file *filp,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct buffer_head *di_bh);
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
{
set_bit(0, (unsigned long *)&iocb->private);
if (level)
set_bit(1, (unsigned long *)&iocb->private);
else
clear_bit(1, (unsigned long *)&iocb->private);
}
/*
* Using a named enum representing lock types in terms of #N bit stored in
* iocb->private, which is going to be used for communication between
* ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
*/
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
OCFS2_IOCB_SEM,
OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_sem_locked(iocb) \
set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_clear_sem_locked(iocb) \
clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_sem_locked(iocb) \
test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_unaligned_aio(iocb) \
set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#define ocfs2_iocb_clear_unaligned_aio(iocb) \
clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_unaligned_aio(iocb) \
test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */

647
fs/ocfs2/blockcheck.c Normal file
View file

@ -0,0 +1,647 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* blockcheck.c
*
* Checksum and ECC codes for the OCFS2 userspace library.
*
* Copyright (C) 2006, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/crc32.h>
#include <linux/buffer_head.h>
#include <linux/bitops.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <asm/byteorder.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "blockcheck.h"
/*
* We use the following conventions:
*
* d = # data bits
* p = # parity bits
* c = # total code bits (d + p)
*/
/*
* Calculate the bit offset in the hamming code buffer based on the bit's
* offset in the data buffer. Since the hamming code reserves all
* power-of-two bits for parity, the data bit number and the code bit
* number are offset by all the parity bits beforehand.
*
* Recall that bit numbers in hamming code are 1-based. This function
* takes the 0-based data bit from the caller.
*
* An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
* so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
* 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
* in the code buffer.
*
* The caller can pass in *p if it wants to keep track of the most recent
* number of parity bits added. This allows the function to start the
* calculation at the last place.
*/
static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
{
unsigned int b, p = 0;
/*
* Data bits are 0-based, but we're talking code bits, which
* are 1-based.
*/
b = i + 1;
/* Use the cache if it is there */
if (p_cache)
p = *p_cache;
b += p;
/*
* For every power of two below our bit number, bump our bit.
*
* We compare with (b + 1) because we have to compare with what b
* would be _if_ it were bumped up by the parity bit. Capice?
*
* p is set above.
*/
for (; (1 << p) < (b + 1); p++)
b++;
if (p_cache)
*p_cache = p;
return b;
}
/*
* This is the low level encoder function. It can be called across
* multiple hunks just like the crc32 code. 'd' is the number of bits
* _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
* two 512B buffers, you would do it like so:
*
* parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
* parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
*
* If you just have one buffer, use ocfs2_hamming_encode_block().
*/
u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
{
unsigned int i, b, p = 0;
BUG_ON(!d);
/*
* b is the hamming code bit number. Hamming code specifies a
* 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
* for the algorithm.
*
* The i++ in the for loop is so that the start offset passed
* to ocfs2_find_next_bit_set() is one greater than the previously
* found bit.
*/
for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
{
/*
* i is the offset in this hunk, nr + i is the total bit
* offset.
*/
b = calc_code_bit(nr + i, &p);
/*
* Data bits in the resultant code are checked by
* parity bits that are part of the bit number
* representation. Huh?
*
* <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
* In other words, the parity bit at position 2^k
* checks bits in positions having bit k set in
* their binary representation. Conversely, for
* instance, bit 13, i.e. 1101(2), is checked by
* bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
* </wikipedia>
*
* Note that 'k' is the _code_ bit number. 'b' in
* our loop.
*/
parity ^= b;
}
/* While the data buffer was treated as little endian, the
* return value is in host endian. */
return parity;
}
u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
{
return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
}
/*
* Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
* offset of the current hunk. If bit to be fixed is not part of the
* current hunk, this does nothing.
*
* If you only have one hunk, use ocfs2_hamming_fix_block().
*/
void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
unsigned int fix)
{
unsigned int i, b;
BUG_ON(!d);
/*
* If the bit to fix has an hweight of 1, it's a parity bit. One
* busted parity bit is its own error. Nothing to do here.
*/
if (hweight32(fix) == 1)
return;
/*
* nr + d is the bit right past the data hunk we're looking at.
* If fix after that, nothing to do
*/
if (fix >= calc_code_bit(nr + d, NULL))
return;
/*
* nr is the offset in the data hunk we're starting at. Let's
* start b at the offset in the code buffer. See hamming_encode()
* for a more detailed description of 'b'.
*/
b = calc_code_bit(nr, NULL);
/* If the fix is before this hunk, nothing to do */
if (fix < b)
return;
for (i = 0; i < d; i++, b++)
{
/* Skip past parity bits */
while (hweight32(b) == 1)
b++;
/*
* i is the offset in this data hunk.
* nr + i is the offset in the total data buffer.
* b is the offset in the total code buffer.
*
* Thus, when b == fix, bit i in the current hunk needs
* fixing.
*/
if (b == fix)
{
if (ocfs2_test_bit(i, data))
ocfs2_clear_bit(i, data);
else
ocfs2_set_bit(i, data);
break;
}
}
}
void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
unsigned int fix)
{
ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
}
/*
* Debugfs handling.
*/
#ifdef CONFIG_DEBUG_FS
static int blockcheck_u64_get(void *data, u64 *val)
{
*val = *(u64 *)data;
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
static struct dentry *blockcheck_debugfs_create(const char *name,
struct dentry *parent,
u64 *value)
{
return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
&blockcheck_fops);
}
static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
{
if (stats) {
debugfs_remove(stats->b_debug_check);
stats->b_debug_check = NULL;
debugfs_remove(stats->b_debug_failure);
stats->b_debug_failure = NULL;
debugfs_remove(stats->b_debug_recover);
stats->b_debug_recover = NULL;
debugfs_remove(stats->b_debug_dir);
stats->b_debug_dir = NULL;
}
}
static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent)
{
int rc = -EINVAL;
if (!stats)
goto out;
stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
if (!stats->b_debug_dir)
goto out;
stats->b_debug_check =
blockcheck_debugfs_create("blocks_checked",
stats->b_debug_dir,
&stats->b_check_count);
stats->b_debug_failure =
blockcheck_debugfs_create("checksums_failed",
stats->b_debug_dir,
&stats->b_failure_count);
stats->b_debug_recover =
blockcheck_debugfs_create("ecc_recoveries",
stats->b_debug_dir,
&stats->b_recover_count);
if (stats->b_debug_check && stats->b_debug_failure &&
stats->b_debug_recover)
rc = 0;
out:
if (rc)
ocfs2_blockcheck_debug_remove(stats);
return rc;
}
#else
static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent)
{
return 0;
}
static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
{
}
#endif /* CONFIG_DEBUG_FS */
/* Always-called wrappers for starting and stopping the debugfs files */
int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent)
{
return ocfs2_blockcheck_debug_install(stats, parent);
}
void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
{
ocfs2_blockcheck_debug_remove(stats);
}
static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
{
u64 new_count;
if (!stats)
return;
spin_lock(&stats->b_lock);
stats->b_check_count++;
new_count = stats->b_check_count;
spin_unlock(&stats->b_lock);
if (!new_count)
mlog(ML_NOTICE, "Block check count has wrapped\n");
}
static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
{
u64 new_count;
if (!stats)
return;
spin_lock(&stats->b_lock);
stats->b_failure_count++;
new_count = stats->b_failure_count;
spin_unlock(&stats->b_lock);
if (!new_count)
mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
}
static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
{
u64 new_count;
if (!stats)
return;
spin_lock(&stats->b_lock);
stats->b_recover_count++;
new_count = stats->b_recover_count;
spin_unlock(&stats->b_lock);
if (!new_count)
mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
}
/*
* These are the low-level APIs for using the ocfs2_block_check structure.
*/
/*
* This function generates check information for a block.
* data is the block to be checked. bc is a pointer to the
* ocfs2_block_check structure describing the crc32 and the ecc.
*
* bc should be a pointer inside data, as the function will
* take care of zeroing it before calculating the check information. If
* bc does not point inside data, the caller must make sure any inline
* ocfs2_block_check structures are zeroed.
*
* The data buffer must be in on-disk endian (little endian for ocfs2).
* bc will be filled with little-endian values and will be ready to go to
* disk.
*/
void ocfs2_block_check_compute(void *data, size_t blocksize,
struct ocfs2_block_check *bc)
{
u32 crc;
u32 ecc;
memset(bc, 0, sizeof(struct ocfs2_block_check));
crc = crc32_le(~0, data, blocksize);
ecc = ocfs2_hamming_encode_block(data, blocksize);
/*
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
}
/*
* This function validates existing check information. Like _compute,
* the function will take care of zeroing bc before calculating check codes.
* If bc is not a pointer inside data, the caller must have zeroed any
* inline ocfs2_block_check structures.
*
* Again, the data passed in should be the on-disk endian.
*/
int ocfs2_block_check_validate(void *data, size_t blocksize,
struct ocfs2_block_check *bc,
struct ocfs2_blockcheck_stats *stats)
{
int rc = 0;
u32 bc_crc32e;
u16 bc_ecc;
u32 crc, ecc;
ocfs2_blockcheck_inc_check(stats);
bc_crc32e = le32_to_cpu(bc->bc_crc32e);
bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
crc = crc32_le(~0, data, blocksize);
if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
(unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
ecc = ocfs2_hamming_encode_block(data, blocksize);
ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
/* And check the crc32 again */
crc = crc32_le(~0, data, blocksize);
if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
(unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
bc->bc_crc32e = cpu_to_le32(bc_crc32e);
bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
/*
* This function generates check information for a list of buffer_heads.
* bhs is the blocks to be checked. bc is a pointer to the
* ocfs2_block_check structure describing the crc32 and the ecc.
*
* bc should be a pointer inside data, as the function will
* take care of zeroing it before calculating the check information. If
* bc does not point inside data, the caller must make sure any inline
* ocfs2_block_check structures are zeroed.
*
* The data buffer must be in on-disk endian (little endian for ocfs2).
* bc will be filled with little-endian values and will be ready to go to
* disk.
*/
void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc)
{
int i;
u32 crc, ecc;
BUG_ON(nr < 0);
if (!nr)
return;
memset(bc, 0, sizeof(struct ocfs2_block_check));
for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
/*
* The number of bits in a buffer is obviously b_size*8.
* The offset of this buffer is b_size*i, so the bit offset
* of this buffer is b_size*8*i.
*/
ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i);
}
/*
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
}
/*
* This function validates existing check information on a list of
* buffer_heads. Like _compute_bhs, the function will take care of
* zeroing bc before calculating check codes. If bc is not a pointer
* inside data, the caller must have zeroed any inline
* ocfs2_block_check structures.
*
* Again, the data passed in should be the on-disk endian.
*/
int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc,
struct ocfs2_blockcheck_stats *stats)
{
int i, rc = 0;
u32 bc_crc32e;
u16 bc_ecc;
u32 crc, ecc, fix;
BUG_ON(nr < 0);
if (!nr)
return 0;
ocfs2_blockcheck_inc_check(stats);
bc_crc32e = le32_to_cpu(bc->bc_crc32e);
bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
(unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
for (i = 0, ecc = 0; i < nr; i++) {
/*
* The number of bits in a buffer is obviously b_size*8.
* The offset of this buffer is b_size*i, so the bit offset
* of this buffer is b_size*8*i.
*/
ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i);
}
fix = ecc ^ bc_ecc;
for (i = 0; i < nr; i++) {
/*
* Try the fix against each buffer. It will only affect
* one of them.
*/
ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i, fix);
}
/* And check the crc32 again */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
(unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
bc->bc_crc32e = cpu_to_le32(bc_crc32e);
bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
/*
* These are the main API. They check the superblock flag before
* calling the underlying operations.
*
* They expect the buffer(s) to be in disk format.
*/
void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc)
{
if (ocfs2_meta_ecc(OCFS2_SB(sb)))
ocfs2_block_check_compute(data, sb->s_blocksize, bc);
}
int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc)
{
int rc = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_meta_ecc(osb))
rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
&osb->osb_ecc_stats);
return rc;
}
void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc)
{
if (ocfs2_meta_ecc(OCFS2_SB(sb)))
ocfs2_block_check_compute_bhs(bhs, nr, bc);
}
int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc)
{
int rc = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_meta_ecc(osb))
rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
&osb->osb_ecc_stats);
return rc;
}

107
fs/ocfs2/blockcheck.h Normal file
View file

@ -0,0 +1,107 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* blockcheck.h
*
* Checksum and ECC codes for the OCFS2 userspace library.
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_BLOCKCHECK_H
#define OCFS2_BLOCKCHECK_H
/* Count errors and error correction from blockcheck.c */
struct ocfs2_blockcheck_stats {
spinlock_t b_lock;
u64 b_check_count; /* Number of blocks we've checked */
u64 b_failure_count; /* Number of failed checksums */
u64 b_recover_count; /* Number of blocks fixed by ecc */
/*
* debugfs entries, used if this is passed to
* ocfs2_blockcheck_stats_debugfs_install()
*/
struct dentry *b_debug_dir; /* Parent of the debugfs files */
struct dentry *b_debug_check; /* Exposes b_check_count */
struct dentry *b_debug_failure; /* Exposes b_failure_count */
struct dentry *b_debug_recover; /* Exposes b_recover_count */
};
/* High level block API */
void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc);
int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc);
void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc);
int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc);
/* Lower level API */
void ocfs2_block_check_compute(void *data, size_t blocksize,
struct ocfs2_block_check *bc);
int ocfs2_block_check_validate(void *data, size_t blocksize,
struct ocfs2_block_check *bc,
struct ocfs2_blockcheck_stats *stats);
void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc);
int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc,
struct ocfs2_blockcheck_stats *stats);
/* Debug Initialization */
int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent);
void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
/*
* Hamming code functions
*/
/*
* Encoding hamming code parity bits for a buffer.
*
* This is the low level encoder function. It can be called across
* multiple hunks just like the crc32 code. 'd' is the number of bits
* _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
* two 512B buffers, you would do it like so:
*
* parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
* parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
*
* If you just have one buffer, use ocfs2_hamming_encode_block().
*/
u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
unsigned int nr);
/*
* Fix a buffer with a bit error. The 'fix' is the original parity
* xor'd with the parity calculated now.
*
* Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
* offset of the current hunk. If bit to be fixed is not part of the
* current hunk, this does nothing.
*
* If you only have one buffer, use ocfs2_hamming_fix_block().
*/
void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
unsigned int fix);
/* Convenience wrappers for a single buffer of data */
extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
unsigned int fix);
#endif

427
fs/ocfs2/buffer_head_io.c Normal file
View file

@ -0,0 +1,427 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* io.c
*
* Buffer cache handling
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"
/*
* Bits on bh->b_state used by ocfs2.
*
* These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
*/
enum ocfs2_state_bits {
BH_NeedsValidate = BH_JBDPrivateStart,
};
/* Expand the magic b_state functions */
BUFFER_FNS(NeedsValidate, needs_validate);
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
struct ocfs2_caching_info *ci)
{
int ret = 0;
trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
/* No need to check for a soft readonly file system here. non
* journalled writes are only ever done on system files which
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
mlog_errno(ret);
goto out;
}
ocfs2_metadata_cache_io_lock(ci);
lock_buffer(bh);
set_buffer_uptodate(bh);
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
ocfs2_set_buffer_uptodate(ci, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
mlog_errno(ret);
}
ocfs2_metadata_cache_io_unlock(ci);
out:
return ret;
}
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
if (buffer_jbd(bh)) {
trace_ocfs2_read_blocks_sync_jbd(
(unsigned long long)bh->b_blocknr);
continue;
}
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
mlog(ML_ERROR,
"trying to sync read a dirty "
"buffer! (blocknr = %llu), skipping\n",
(unsigned long long)bh->b_blocknr);
continue;
}
lock_buffer(bh);
if (buffer_jbd(bh)) {
mlog(ML_ERROR,
"block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
}
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* Status won't be cleared from here on out,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
put_bh(bh);
bhs[i - 1] = NULL;
}
}
bail:
return status;
}
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
{
int status = 0;
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
BUG_ON(!ci);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
(flags & OCFS2_BH_IGNORE_CACHE));
if (bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr < 0) {
mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
if (nr == 0) {
status = 0;
goto bail;
}
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
/* There are three read-ahead cases here which we need to
* be concerned with. All three assume a buffer has
* previously been submitted with OCFS2_BH_READAHEAD
* and it hasn't yet completed I/O.
*
* 1) The current request is sync to disk. This rarely
* happens these days, and never when performance
* matters - the code can just wait on the buffer
* lock and re-submit.
*
* 2) The current request is cached, but not
* readahead. ocfs2_buffer_uptodate() will return
* false anyway, so we'll wind up waiting on the
* buffer lock to do I/O. We re-check the request
* with after getting the lock to avoid a re-submit.
*
* 3) The current request is readahead (and so must
* also be a caching one). We short circuit if the
* buffer is locked (under I/O) and if it's in the
* uptodate cache. The re-check from #2 catches the
* case that the previous read-ahead completes just
* before our is-it-in-flight check.
*/
if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
trace_ocfs2_read_blocks_from_disk(
(unsigned long long)bh->b_blocknr,
(unsigned long long)ocfs2_metadata_cache_owner(ci));
/* We're using ignore_cache here to say
* "go to disk" */
ignore_cache = 1;
}
trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
if (buffer_jbd(bh)) {
continue;
}
if (ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
continue;
}
/* A read-ahead request was made - if the
* buffer is already under read-ahead from a
* previously submitted request than we are
* done here. */
if ((flags & OCFS2_BH_READAHEAD)
&& ocfs2_buffer_read_ahead(ci, bh))
continue;
lock_buffer(bh);
if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
mlog(ML_ERROR, "block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
#else
unlock_buffer(bh);
continue;
#endif
}
/* Re-check ocfs2_buffer_uptodate() as a
* previously read-ahead buffer may have
* completed I/O while we were waiting for the
* buffer lock. */
if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
&& ocfs2_buffer_uptodate(ci, bh)) {
unlock_buffer(bh);
continue;
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
if (validate)
set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
continue;
}
}
status = 0;
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* Status won't be cleared from here on out,
* so we can safely record this and loop back
* to cleanup the other buffers. Don't need to
* remove the clustered uptodate information
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
put_bh(bh);
bhs[i] = NULL;
continue;
}
if (buffer_needs_validate(bh)) {
/* We never set NeedsValidate if the
* buffer was held by the journal, so
* that better not have changed */
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
if (status) {
put_bh(bh);
bhs[i] = NULL;
continue;
}
}
}
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
ocfs2_set_buffer_uptodate(ci, bh);
}
ocfs2_metadata_cache_io_unlock(ci);
trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
flags, ignore_cache);
bail:
return status;
}
/* Check whether the blkno is the super block or one of the backups. */
static void ocfs2_check_super_or_backup(struct super_block *sb,
sector_t blkno)
{
int i;
u64 backup_blkno;
if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
return;
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
backup_blkno = ocfs2_backup_super_blkno(sb, i);
if (backup_blkno == blkno)
return;
}
BUG();
}
/*
* Write super block and backups doesn't need to collaborate with journal,
* so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
* into this function.
*/
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh)
{
int ret = 0;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
BUG_ON(buffer_jbd(bh));
ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
ret = -EROFS;
mlog_errno(ret);
goto out;
}
lock_buffer(bh);
set_buffer_uptodate(bh);
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ret = -EIO;
mlog_errno(ret);
}
out:
return ret;
}

77
fs/ocfs2/buffer_head_io.h Normal file
View file

@ -0,0 +1,77 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_buffer_head.h
*
* Buffer cache handling functions defined
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_BUFFER_HEAD_IO_H
#define OCFS2_BUFFER_HEAD_IO_H
#include <linux/buffer_head.h>
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct ocfs2_caching_info *ci);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
/*
* If not NULL, validate() will be called on a buffer that is freshly
* read from disk. It will not be called if the buffer was in cache.
* Note that if validate() is being used for this buffer, it needs to
* be set even for a READAHEAD call, as it marks the buffer for later
* validation.
*/
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
{
int status = 0;
if (bh == NULL) {
printk("ocfs2: bh == NULL\n");
status = -EINVAL;
goto bail;
}
status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
bail:
return status;
}
#endif /* OCFS2_BUFFER_HEAD_IO_H */

View file

@ -0,0 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
quorum.o tcp.o netdebug.o

2678
fs/ocfs2/cluster/heartbeat.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,90 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* heartbeat.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef O2CLUSTER_HEARTBEAT_H
#define O2CLUSTER_HEARTBEAT_H
#include "ocfs2_heartbeat.h"
#define O2HB_REGION_TIMEOUT_MS 2000
#define O2HB_MAX_REGION_NAME_LEN 32
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
#define O2HB_DEFAULT_DEAD_THRESHOLD 31
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD 2
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
#define O2HB_CB_MAGIC 0x51d1e4ec
/* callback stuff */
enum o2hb_callback_type {
O2HB_NODE_DOWN_CB = 0,
O2HB_NODE_UP_CB,
O2HB_NUM_CB
};
struct o2nm_node;
typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
struct o2hb_callback_func {
u32 hc_magic;
struct list_head hc_item;
o2hb_cb_func *hc_func;
void *hc_data;
int hc_priority;
enum o2hb_callback_type hc_type;
};
struct config_group *o2hb_alloc_hb_set(void);
void o2hb_free_hb_set(struct config_group *group);
void o2hb_setup_callback(struct o2hb_callback_func *hc,
enum o2hb_callback_type type,
o2hb_cb_func *func,
void *data,
int priority);
int o2hb_register_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
unsigned bytes);
void o2hb_exit(void);
int o2hb_init(void);
int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
int o2hb_get_all_regions(char *region_uuids, u8 numregions);
int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */

155
fs/ocfs2/cluster/masklog.c Normal file
View file

@ -0,0 +1,155 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <asm/uaccess.h>
#include "masklog.h"
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
EXPORT_SYMBOL_GPL(mlog_and_bits);
struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
EXPORT_SYMBOL_GPL(mlog_not_bits);
static ssize_t mlog_mask_show(u64 mask, char *buf)
{
char *state;
if (__mlog_test_u64(mask, mlog_and_bits))
state = "allow";
else if (__mlog_test_u64(mask, mlog_not_bits))
state = "deny";
else
state = "off";
return snprintf(buf, PAGE_SIZE, "%s\n", state);
}
static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
if (!strncasecmp(buf, "allow", 5)) {
__mlog_set_u64(mask, mlog_and_bits);
__mlog_clear_u64(mask, mlog_not_bits);
} else if (!strncasecmp(buf, "deny", 4)) {
__mlog_set_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
} else if (!strncasecmp(buf, "off", 3)) {
__mlog_clear_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
} else
return -EINVAL;
return count;
}
struct mlog_attribute {
struct attribute attr;
u64 mask;
};
#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
#define define_mask(_name) { \
.attr = { \
.name = #_name, \
.mode = S_IRUGO | S_IWUSR, \
}, \
.mask = ML_##_name, \
}
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(TCP),
define_mask(MSG),
define_mask(SOCKET),
define_mask(HEARTBEAT),
define_mask(HB_BIO),
define_mask(DLMFS),
define_mask(DLM),
define_mask(DLM_DOMAIN),
define_mask(DLM_THREAD),
define_mask(DLM_MASTER),
define_mask(DLM_RECOVERY),
define_mask(DLM_GLUE),
define_mask(VOTE),
define_mask(CONN),
define_mask(QUORUM),
define_mask(BASTS),
define_mask(CLUSTER),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
};
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
char *buf)
{
struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
return mlog_mask_show(mlog_attr->mask, buf);
}
static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
const char *buf, size_t count)
{
struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
return mlog_mask_store(mlog_attr->mask, buf, count);
}
static const struct sysfs_ops mlog_attr_ops = {
.show = mlog_show,
.store = mlog_store,
};
static struct kobj_type mlog_ktype = {
.default_attrs = mlog_attr_ptrs,
.sysfs_ops = &mlog_attr_ops,
};
static struct kset mlog_kset = {
.kobj = {.ktype = &mlog_ktype},
};
int mlog_sys_init(struct kset *o2cb_kset)
{
int i = 0;
while (mlog_attrs[i].attr.mode) {
mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
i++;
}
mlog_attr_ptrs[i] = NULL;
kobject_set_name(&mlog_kset.kobj, "logmask");
mlog_kset.kobj.kset = o2cb_kset;
return kset_register(&mlog_kset);
}
void mlog_sys_shutdown(void)
{
kset_unregister(&mlog_kset);
}

220
fs/ocfs2/cluster/masklog.h Normal file
View file

@ -0,0 +1,220 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef O2CLUSTER_MASKLOG_H
#define O2CLUSTER_MASKLOG_H
/*
* For now this is a trivial wrapper around printk() that gives the critical
* ability to enable sets of debugging output at run-time. In the future this
* will almost certainly be redirected to relayfs so that it can pay a
* substantially lower heisenberg tax.
*
* Callers associate the message with a bitmask and a global bitmask is
* maintained with help from /proc. If any of the bits match the message is
* output.
*
* We must have efficient bit tests on i386 and it seems gcc still emits crazy
* code for the 64bit compare. It emits very good code for the dual unsigned
* long tests, though, completely avoiding tests that can never pass if the
* caller gives a constant bitmask that fills one of the longs with all 0s. So
* the desire is to have almost all of the calls decided on by comparing just
* one of the longs. This leads to having infrequently given bits that are
* frequently matched in the high bits.
*
* _ERROR and _NOTICE are used for messages that always go to the console and
* have appropriate KERN_ prefixes. We wrap these in our function instead of
* just calling printk() so that this can eventually make its way through
* relayfs along with the debugging messages. Everything else gets KERN_DEBUG.
* The inline tests and macro dance give GCC the opportunity to quite cleverly
* only emit the appropriage printk() when the caller passes in a constant
* mask, as is almost always the case.
*
* All this bitmask nonsense is managed from the files under
* /sys/fs/o2cb/logmask/. Reading the files gives a straightforward
* indication of which bits are allowed (allow) or denied (off/deny).
* ENTRY deny
* EXIT deny
* TCP off
* MSG off
* SOCKET off
* ERROR allow
* NOTICE allow
*
* Writing changes the state of a given bit and requires a strictly formatted
* single write() call:
*
* write(fd, "allow", 5);
*
* Echoing allow/deny/off string into the logmask files can flip the bits
* on or off as expected; here is the bash script for example:
*
* log_mask="/sys/fs/o2cb/log_mask"
* for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
* echo allow >"$log_mask"/"$node"
* done
*
* The debugfs.ocfs2 tool can also flip the bits with the -l option:
*
* debugfs.ocfs2 -l TCP allow
*/
/* for task_struct */
#include <linux/sched.h>
/* bits that are frequently given and infrequently matched in the low word */
/* NOTE: If you add a flag, you need to also update masklog.c! */
#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
#define ML_MSG 0x0000000000000002ULL /* net network messages */
#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
#define ML_CONN 0x0000000000002000ULL /* net connection management */
#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */
#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX 0
#endif
/*
* When logging is disabled, force the bit test to 0 for anything other
* than errors and notices, allowing gcc to remove the code completely.
* When enabled, allow all masks.
*/
#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
#define ML_ALLOWED_BITS ~0
#else
#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
#endif
#define MLOG_MAX_BITS 64
struct mlog_bits {
unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
};
extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#if BITS_PER_LONG == 32
#define __mlog_test_u64(mask, bits) \
( (u32)(mask & 0xffffffff) & bits.words[0] || \
((u64)(mask) >> 32) & bits.words[1] )
#define __mlog_set_u64(mask, bits) do { \
bits.words[0] |= (u32)(mask & 0xffffffff); \
bits.words[1] |= (u64)(mask) >> 32; \
} while (0)
#define __mlog_clear_u64(mask, bits) do { \
bits.words[0] &= ~((u32)(mask & 0xffffffff)); \
bits.words[1] &= ~((u64)(mask) >> 32); \
} while (0)
#define MLOG_BITS_RHS(mask) { \
{ \
[0] = (u32)(mask & 0xffffffff), \
[1] = (u64)(mask) >> 32, \
} \
}
#else /* 32bit long above, 64bit long below */
#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0])
#define __mlog_set_u64(mask, bits) do { \
bits.words[0] |= (mask); \
} while (0)
#define __mlog_clear_u64(mask, bits) do { \
bits.words[0] &= ~(mask); \
} while (0)
#define MLOG_BITS_RHS(mask) { { (mask) } }
#endif
/*
* smp_processor_id() "helpfully" screams when called outside preemptible
* regions in current kernels. sles doesn't have the variants that don't
* scream. just do this instead of trying to guess which we're building
* against.. *sigh*.
*/
#define __mlog_cpu_guess ({ \
unsigned long _cpu = get_cpu(); \
put_cpu(); \
_cpu; \
})
/* In the following two macros, the whitespace after the ',' just
* before ##args is intentional. Otherwise, gcc 2.95 will eat the
* previous token if args expands to nothing.
*/
#define __mlog_printk(level, fmt, args...) \
printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
task_pid_nr(current), __mlog_cpu_guess, \
__PRETTY_FUNCTION__, __LINE__ , ##args)
#define mlog(mask, fmt, args...) do { \
u64 __m = MLOG_MASK_PREFIX | (mask); \
if ((__m & ML_ALLOWED_BITS) && \
__mlog_test_u64(__m, mlog_and_bits) && \
!__mlog_test_u64(__m, mlog_not_bits)) { \
if (__m & ML_ERROR) \
__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
else if (__m & ML_NOTICE) \
__mlog_printk(KERN_NOTICE, fmt , ##args); \
else __mlog_printk(KERN_INFO, fmt , ##args); \
} \
} while (0)
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
_st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
_st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
mlog(ML_ERROR, "bug expression: " #cond "\n"); \
mlog(ML_ERROR, fmt, ##args); \
BUG(); \
} \
} while (0)
#include <linux/kobject.h>
#include <linux/sysfs.h>
int mlog_sys_init(struct kset *o2cb_subsys);
void mlog_sys_shutdown(void);
#endif /* O2CLUSTER_MASKLOG_H */

539
fs/ocfs2/cluster/netdebug.c Normal file
View file

@ -0,0 +1,539 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* netdebug.c
*
* debug functionality for o2net
*
* Copyright (C) 2005, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifdef CONFIG_DEBUG_FS
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include "tcp.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_TCP
#include "masklog.h"
#include "tcp_internal.h"
#define O2NET_DEBUG_DIR "o2net"
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
#define STATS_DEBUG_NAME "stats"
#define NODES_DEBUG_NAME "connected_nodes"
#define SHOW_SOCK_CONTAINERS 0
#define SHOW_SOCK_STATS 1
static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
static struct dentry *stats_dentry;
static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
static LIST_HEAD(sock_containers);
static LIST_HEAD(send_tracking);
void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
spin_lock(&o2net_debug_lock);
list_add(&nst->st_net_debug_item, &send_tracking);
spin_unlock(&o2net_debug_lock);
}
void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
spin_lock(&o2net_debug_lock);
if (!list_empty(&nst->st_net_debug_item))
list_del_init(&nst->st_net_debug_item);
spin_unlock(&o2net_debug_lock);
}
static struct o2net_send_tracking
*next_nst(struct o2net_send_tracking *nst_start)
{
struct o2net_send_tracking *nst, *ret = NULL;
assert_spin_locked(&o2net_debug_lock);
list_for_each_entry(nst, &nst_start->st_net_debug_item,
st_net_debug_item) {
/* discover the head of the list */
if (&nst->st_net_debug_item == &send_tracking)
break;
/* use st_task to detect real nsts in the list */
if (nst->st_task != NULL) {
ret = nst;
break;
}
}
return ret;
}
static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
spin_unlock(&o2net_debug_lock);
return nst;
}
static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
list_del_init(&dummy_nst->st_net_debug_item);
if (nst)
list_add(&dummy_nst->st_net_debug_item,
&nst->st_net_debug_item);
spin_unlock(&o2net_debug_lock);
return nst; /* unused, just needs to be null when done */
}
static int nst_seq_show(struct seq_file *seq, void *v)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
ktime_t now;
s64 sock, send, status;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
if (!nst)
goto out;
now = ktime_get();
sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
send = ktime_to_us(ktime_sub(now, nst->st_send_time));
status = ktime_to_us(ktime_sub(now, nst->st_status_time));
/* get_task_comm isn't exported. oh well. */
seq_printf(seq, "%p:\n"
" pid: %lu\n"
" tgid: %lu\n"
" process name: %s\n"
" node: %u\n"
" sc: %p\n"
" message id: %d\n"
" message type: %u\n"
" message key: 0x%08x\n"
" sock acquiry: %lld usecs ago\n"
" send start: %lld usecs ago\n"
" wait start: %lld usecs ago\n",
nst, (unsigned long)task_pid_nr(nst->st_task),
(unsigned long)nst->st_task->tgid,
nst->st_task->comm, nst->st_node,
nst->st_sc, nst->st_id, nst->st_msg_type,
nst->st_msg_key,
(long long)sock,
(long long)send,
(long long)status);
out:
spin_unlock(&o2net_debug_lock);
return 0;
}
static void nst_seq_stop(struct seq_file *seq, void *v)
{
}
static const struct seq_operations nst_seq_ops = {
.start = nst_seq_start,
.next = nst_seq_next,
.stop = nst_seq_stop,
.show = nst_seq_show,
};
static int nst_fop_open(struct inode *inode, struct file *file)
{
struct o2net_send_tracking *dummy_nst;
dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
if (!dummy_nst)
return -ENOMEM;
o2net_debug_add_nst(dummy_nst);
return 0;
}
static int nst_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
struct o2net_send_tracking *dummy_nst = seq->private;
o2net_debug_del_nst(dummy_nst);
return seq_release_private(inode, file);
}
static const struct file_operations nst_seq_fops = {
.open = nst_fop_open,
.read = seq_read,
.llseek = seq_lseek,
.release = nst_fop_release,
};
void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
spin_lock(&o2net_debug_lock);
list_add(&sc->sc_net_debug_item, &sock_containers);
spin_unlock(&o2net_debug_lock);
}
void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
spin_lock(&o2net_debug_lock);
list_del_init(&sc->sc_net_debug_item);
spin_unlock(&o2net_debug_lock);
}
struct o2net_sock_debug {
int dbg_ctxt;
struct o2net_sock_container *dbg_sock;
};
static struct o2net_sock_container
*next_sc(struct o2net_sock_container *sc_start)
{
struct o2net_sock_container *sc, *ret = NULL;
assert_spin_locked(&o2net_debug_lock);
list_for_each_entry(sc, &sc_start->sc_net_debug_item,
sc_net_debug_item) {
/* discover the head of the list miscast as a sc */
if (&sc->sc_net_debug_item == &sock_containers)
break;
/* use sc_page to detect real scs in the list */
if (sc->sc_page != NULL) {
ret = sc;
break;
}
}
return ret;
}
static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
{
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
spin_unlock(&o2net_debug_lock);
return sc;
}
static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
list_del_init(&dummy_sc->sc_net_debug_item);
if (sc)
list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
spin_unlock(&o2net_debug_lock);
return sc; /* unused, just needs to be null when done */
}
#ifdef CONFIG_OCFS2_FS_STATS
# define sc_send_count(_s) ((_s)->sc_send_count)
# define sc_recv_count(_s) ((_s)->sc_recv_count)
# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
#else
# define sc_send_count(_s) (0U)
# define sc_recv_count(_s) (0U)
# define sc_tv_acquiry_total_ns(_s) (0LL)
# define sc_tv_send_total_ns(_s) (0LL)
# define sc_tv_status_total_ns(_s) (0LL)
# define sc_tv_process_total_ns(_s) (0LL)
#endif
/* So that debugfs.ocfs2 can determine which format is being used */
#define O2NET_STATS_STR_VERSION 1
static void sc_show_sock_stats(struct seq_file *seq,
struct o2net_sock_container *sc)
{
if (!sc)
return;
seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
(long long)sc_tv_acquiry_total_ns(sc),
(long long)sc_tv_send_total_ns(sc),
(long long)sc_tv_status_total_ns(sc),
(unsigned long)sc_recv_count(sc),
(long long)sc_tv_process_total_ns(sc));
}
static void sc_show_sock_container(struct seq_file *seq,
struct o2net_sock_container *sc)
{
struct inet_sock *inet = NULL;
__be32 saddr = 0, daddr = 0;
__be16 sport = 0, dport = 0;
if (!sc)
return;
if (sc->sc_sock) {
inet = inet_sk(sc->sc_sock->sk);
/* the stack's structs aren't sparse endian clean */
saddr = (__force __be32)inet->inet_saddr;
daddr = (__force __be32)inet->inet_daddr;
sport = (__force __be16)inet->inet_sport;
dport = (__force __be16)inet->inet_dport;
}
/* XXX sigh, inet-> doesn't have sparse annotation so any
* use of it here generates a warning with -Wbitwise */
seq_printf(seq, "%p:\n"
" krefs: %d\n"
" sock: %pI4:%u -> "
"%pI4:%u\n"
" remote node: %s\n"
" page off: %zu\n"
" handshake ok: %u\n"
" timer: %lld usecs\n"
" data ready: %lld usecs\n"
" advance start: %lld usecs\n"
" advance stop: %lld usecs\n"
" func start: %lld usecs\n"
" func stop: %lld usecs\n"
" func key: 0x%08x\n"
" func type: %u\n",
sc,
atomic_read(&sc->sc_kref.refcount),
&saddr, inet ? ntohs(sport) : 0,
&daddr, inet ? ntohs(dport) : 0,
sc->sc_node->nd_name,
sc->sc_page_off,
sc->sc_handshake_ok,
(long long)ktime_to_us(sc->sc_tv_timer),
(long long)ktime_to_us(sc->sc_tv_data_ready),
(long long)ktime_to_us(sc->sc_tv_advance_start),
(long long)ktime_to_us(sc->sc_tv_advance_stop),
(long long)ktime_to_us(sc->sc_tv_func_start),
(long long)ktime_to_us(sc->sc_tv_func_stop),
sc->sc_msg_key,
sc->sc_msg_type);
}
static int sc_seq_show(struct seq_file *seq, void *v)
{
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
if (sc) {
if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
sc_show_sock_container(seq, sc);
else
sc_show_sock_stats(seq, sc);
}
spin_unlock(&o2net_debug_lock);
return 0;
}
static void sc_seq_stop(struct seq_file *seq, void *v)
{
}
static const struct seq_operations sc_seq_ops = {
.start = sc_seq_start,
.next = sc_seq_next,
.stop = sc_seq_stop,
.show = sc_seq_show,
};
static int sc_common_open(struct file *file, int ctxt)
{
struct o2net_sock_debug *sd;
struct o2net_sock_container *dummy_sc;
dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
if (!dummy_sc)
return -ENOMEM;
sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
if (!sd) {
kfree(dummy_sc);
return -ENOMEM;
}
sd->dbg_ctxt = ctxt;
sd->dbg_sock = dummy_sc;
o2net_debug_add_sc(dummy_sc);
return 0;
}
static int sc_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
return seq_release_private(inode, file);
}
static int stats_fop_open(struct inode *inode, struct file *file)
{
return sc_common_open(file, SHOW_SOCK_STATS);
}
static const struct file_operations stats_seq_fops = {
.open = stats_fop_open,
.read = seq_read,
.llseek = seq_lseek,
.release = sc_fop_release,
};
static int sc_fop_open(struct inode *inode, struct file *file)
{
return sc_common_open(file, SHOW_SOCK_CONTAINERS);
}
static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
.llseek = seq_lseek,
.release = sc_fop_release,
};
static int o2net_fill_bitmap(char *buf, int len)
{
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
o2net_fill_node_map(map, sizeof(map));
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
return out;
}
static int nodes_fop_open(struct inode *inode, struct file *file)
{
char *buf;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
file->private_data = buf;
return 0;
}
static int o2net_debug_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}
static ssize_t o2net_debug_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
i_size_read(file->f_mapping->host));
}
static const struct file_operations nodes_fops = {
.open = nodes_fop_open,
.release = o2net_debug_release,
.read = o2net_debug_read,
.llseek = generic_file_llseek,
};
void o2net_debugfs_exit(void)
{
debugfs_remove(nodes_dentry);
debugfs_remove(stats_dentry);
debugfs_remove(sc_dentry);
debugfs_remove(nst_dentry);
debugfs_remove(o2net_dentry);
}
int o2net_debugfs_init(void)
{
umode_t mode = S_IFREG|S_IRUSR;
o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
if (o2net_dentry)
nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
o2net_dentry, NULL, &nst_seq_fops);
if (nst_dentry)
sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
o2net_dentry, NULL, &sc_seq_fops);
if (sc_dentry)
stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
o2net_dentry, NULL, &stats_seq_fops);
if (stats_dentry)
nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
o2net_dentry, NULL, &nodes_fops);
if (nodes_dentry)
return 0;
o2net_debugfs_exit();
mlog_errno(-ENOMEM);
return -ENOMEM;
}
#endif /* CONFIG_DEBUG_FS */

View file

@ -0,0 +1,987 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
#include "tcp.h"
#include "nodemanager.h"
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
"reset", /* O2NM_FENCE_RESET */
"panic", /* O2NM_FENCE_PANIC */
};
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
struct o2nm_node *node = NULL;
if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
goto out;
read_lock(&o2nm_single_cluster->cl_nodes_lock);
node = o2nm_single_cluster->cl_nodes[node_num];
if (node)
config_item_get(&node->nd_item);
read_unlock(&o2nm_single_cluster->cl_nodes_lock);
out:
return node;
}
EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
{
struct o2nm_cluster *cluster = o2nm_single_cluster;
BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
if (cluster == NULL)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
read_unlock(&cluster->cl_nodes_lock);
return 0;
}
EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
__be32 ip_needle,
struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
struct rb_node *parent = NULL;
struct o2nm_node *node, *ret = NULL;
while (*p) {
int cmp;
parent = *p;
node = rb_entry(parent, struct o2nm_node, nd_ip_node);
cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
sizeof(ip_needle));
if (cmp < 0)
p = &(*p)->rb_left;
else if (cmp > 0)
p = &(*p)->rb_right;
else {
ret = node;
break;
}
}
if (ret_p != NULL)
*ret_p = p;
if (ret_parent != NULL)
*ret_parent = parent;
return ret;
}
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
{
struct o2nm_node *node = NULL;
struct o2nm_cluster *cluster = o2nm_single_cluster;
if (cluster == NULL)
goto out;
read_lock(&cluster->cl_nodes_lock);
node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
if (node)
config_item_get(&node->nd_item);
read_unlock(&cluster->cl_nodes_lock);
out:
return node;
}
EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
void o2nm_node_put(struct o2nm_node *node)
{
config_item_put(&node->nd_item);
}
EXPORT_SYMBOL_GPL(o2nm_node_put);
void o2nm_node_get(struct o2nm_node *node)
{
config_item_get(&node->nd_item);
}
EXPORT_SYMBOL_GPL(o2nm_node_get);
u8 o2nm_this_node(void)
{
u8 node_num = O2NM_MAX_NODES;
if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
node_num = o2nm_single_cluster->cl_local_node;
return node_num;
}
EXPORT_SYMBOL_GPL(o2nm_this_node);
/* node configfs bits */
static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
{
return item ?
container_of(to_config_group(item), struct o2nm_cluster,
cl_group)
: NULL;
}
static struct o2nm_node *to_o2nm_node(struct config_item *item)
{
return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
}
static void o2nm_node_release(struct config_item *item)
{
struct o2nm_node *node = to_o2nm_node(item);
kfree(node);
}
static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
{
return sprintf(page, "%d\n", node->nd_num);
}
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
{
/* through the first node_set .parent
* mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
}
enum {
O2NM_NODE_ATTR_NUM = 0,
O2NM_NODE_ATTR_PORT,
O2NM_NODE_ATTR_ADDRESS,
O2NM_NODE_ATTR_LOCAL,
};
static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
return -EINVAL;
if (tmp >= O2NM_MAX_NODES)
return -ERANGE;
/* once we're in the cl_nodes tree networking can look us up by
* node number and try to use our address and port attributes
* to connect to this node.. make sure that they've been set
* before writing the node attribute? */
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
return -EINVAL; /* XXX */
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
p = NULL;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
if (p == NULL)
return -EEXIST;
return count;
}
static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
{
return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
}
static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
const char *page, size_t count)
{
unsigned long tmp;
char *p = (char *)page;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
return -EINVAL;
if (tmp == 0)
return -EINVAL;
if (tmp >= (u16)-1)
return -ERANGE;
node->nd_ipv4_port = htons(tmp);
return count;
}
static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
{
return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
}
static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
const char *page,
size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
int ret, i;
struct rb_node **p, *parent;
unsigned int octets[4];
__be32 ipv4_addr = 0;
ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
&octets[1], &octets[0]);
if (ret != 4)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(octets); i++) {
if (octets[i] > 255)
return -ERANGE;
be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
}
ret = 0;
write_lock(&cluster->cl_nodes_lock);
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
ret = -EEXIST;
else {
rb_link_node(&node->nd_ip_node, parent, p);
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
}
write_unlock(&cluster->cl_nodes_lock);
if (ret)
return ret;
memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
return count;
}
static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
{
return sprintf(page, "%d\n", node->nd_local);
}
static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
ssize_t ret;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
return -EINVAL;
tmp = !!tmp; /* boolean of whether this node wants to be local */
/* setting local turns on networking rx for now so we require having
* set everything else first */
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
!test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
return -EINVAL; /* XXX */
/* the only failure case is trying to set a new local node
* when a different one is already set */
if (tmp && tmp == cluster->cl_has_local &&
cluster->cl_local_node != node->nd_num)
return -EBUSY;
/* bring up the rx thread if we're setting the new local node. */
if (tmp && !cluster->cl_has_local) {
ret = o2net_start_listening(node);
if (ret)
return ret;
}
if (!tmp && cluster->cl_has_local &&
cluster->cl_local_node == node->nd_num) {
o2net_stop_listening(node);
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
}
node->nd_local = tmp;
if (node->nd_local) {
cluster->cl_has_local = tmp;
cluster->cl_local_node = node->nd_num;
}
return count;
}
struct o2nm_node_attribute {
struct configfs_attribute attr;
ssize_t (*show)(struct o2nm_node *, char *);
ssize_t (*store)(struct o2nm_node *, const char *, size_t);
};
static struct o2nm_node_attribute o2nm_node_attr_num = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "num",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_node_num_read,
.store = o2nm_node_num_write,
};
static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "ipv4_port",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_node_ipv4_port_read,
.store = o2nm_node_ipv4_port_write,
};
static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "ipv4_address",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_node_ipv4_address_read,
.store = o2nm_node_ipv4_address_write,
};
static struct o2nm_node_attribute o2nm_node_attr_local = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "local",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_node_local_read,
.store = o2nm_node_local_write,
};
static struct configfs_attribute *o2nm_node_attrs[] = {
[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
NULL,
};
static int o2nm_attr_index(struct configfs_attribute *attr)
{
int i;
for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
if (attr == o2nm_node_attrs[i])
return i;
}
BUG();
return 0;
}
static ssize_t o2nm_node_show(struct config_item *item,
struct configfs_attribute *attr,
char *page)
{
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_node_attribute *o2nm_node_attr =
container_of(attr, struct o2nm_node_attribute, attr);
ssize_t ret = 0;
if (o2nm_node_attr->show)
ret = o2nm_node_attr->show(node, page);
return ret;
}
static ssize_t o2nm_node_store(struct config_item *item,
struct configfs_attribute *attr,
const char *page, size_t count)
{
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_node_attribute *o2nm_node_attr =
container_of(attr, struct o2nm_node_attribute, attr);
ssize_t ret;
int attr_index = o2nm_attr_index(attr);
if (o2nm_node_attr->store == NULL) {
ret = -EINVAL;
goto out;
}
if (test_bit(attr_index, &node->nd_set_attributes))
return -EBUSY;
ret = o2nm_node_attr->store(node, page, count);
if (ret < count)
goto out;
set_bit(attr_index, &node->nd_set_attributes);
out:
return ret;
}
static struct configfs_item_operations o2nm_node_item_ops = {
.release = o2nm_node_release,
.show_attribute = o2nm_node_show,
.store_attribute = o2nm_node_store,
};
static struct config_item_type o2nm_node_type = {
.ct_item_ops = &o2nm_node_item_ops,
.ct_attrs = o2nm_node_attrs,
.ct_owner = THIS_MODULE,
};
/* node set */
struct o2nm_node_group {
struct config_group ns_group;
/* some stuff? */
};
#if 0
static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
{
return group ?
container_of(group, struct o2nm_node_group, ns_group)
: NULL;
}
#endif
struct o2nm_cluster_attribute {
struct configfs_attribute attr;
ssize_t (*show)(struct o2nm_cluster *, char *);
ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
};
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
unsigned long tmp;
char *p = (char *)page;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
return -EINVAL;
if (tmp == 0)
return -EINVAL;
if (tmp >= (u32)-1)
return -ERANGE;
*val = tmp;
return count;
}
static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
struct o2nm_cluster *cluster, char *page)
{
return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
}
static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
struct o2nm_cluster *cluster, const char *page, size_t count)
{
ssize_t ret;
unsigned int val;
ret = o2nm_cluster_attr_write(page, count, &val);
if (ret > 0) {
if (cluster->cl_idle_timeout_ms != val
&& o2net_num_connected_peers()) {
mlog(ML_NOTICE,
"o2net: cannot change idle timeout after "
"the first peer has agreed to it."
" %d connected peers\n",
o2net_num_connected_peers());
ret = -EINVAL;
} else if (val <= cluster->cl_keepalive_delay_ms) {
mlog(ML_NOTICE, "o2net: idle timeout must be larger "
"than keepalive delay\n");
ret = -EINVAL;
} else {
cluster->cl_idle_timeout_ms = val;
}
}
return ret;
}
static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
struct o2nm_cluster *cluster, char *page)
{
return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
}
static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
struct o2nm_cluster *cluster, const char *page, size_t count)
{
ssize_t ret;
unsigned int val;
ret = o2nm_cluster_attr_write(page, count, &val);
if (ret > 0) {
if (cluster->cl_keepalive_delay_ms != val
&& o2net_num_connected_peers()) {
mlog(ML_NOTICE,
"o2net: cannot change keepalive delay after"
" the first peer has agreed to it."
" %d connected peers\n",
o2net_num_connected_peers());
ret = -EINVAL;
} else if (val >= cluster->cl_idle_timeout_ms) {
mlog(ML_NOTICE, "o2net: keepalive delay must be "
"smaller than idle timeout\n");
ret = -EINVAL;
} else {
cluster->cl_keepalive_delay_ms = val;
}
}
return ret;
}
static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
struct o2nm_cluster *cluster, char *page)
{
return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
}
static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
struct o2nm_cluster *cluster, const char *page, size_t count)
{
return o2nm_cluster_attr_write(page, count,
&cluster->cl_reconnect_delay_ms);
}
static ssize_t o2nm_cluster_attr_fence_method_read(
struct o2nm_cluster *cluster, char *page)
{
ssize_t ret = 0;
if (cluster)
ret = sprintf(page, "%s\n",
o2nm_fence_method_desc[cluster->cl_fence_method]);
return ret;
}
static ssize_t o2nm_cluster_attr_fence_method_write(
struct o2nm_cluster *cluster, const char *page, size_t count)
{
unsigned int i;
if (page[count - 1] != '\n')
goto bail;
for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
if (count != strlen(o2nm_fence_method_desc[i]) + 1)
continue;
if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
continue;
if (cluster->cl_fence_method != i) {
printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
o2nm_fence_method_desc[i]);
cluster->cl_fence_method = i;
}
return count;
}
bail:
return -EINVAL;
}
static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "idle_timeout_ms",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_cluster_attr_idle_timeout_ms_read,
.store = o2nm_cluster_attr_idle_timeout_ms_write,
};
static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "keepalive_delay_ms",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_cluster_attr_keepalive_delay_ms_read,
.store = o2nm_cluster_attr_keepalive_delay_ms_write,
};
static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "reconnect_delay_ms",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_cluster_attr_reconnect_delay_ms_read,
.store = o2nm_cluster_attr_reconnect_delay_ms_write,
};
static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "fence_method",
.ca_mode = S_IRUGO | S_IWUSR },
.show = o2nm_cluster_attr_fence_method_read,
.store = o2nm_cluster_attr_fence_method_write,
};
static struct configfs_attribute *o2nm_cluster_attrs[] = {
&o2nm_cluster_attr_idle_timeout_ms.attr,
&o2nm_cluster_attr_keepalive_delay_ms.attr,
&o2nm_cluster_attr_reconnect_delay_ms.attr,
&o2nm_cluster_attr_fence_method.attr,
NULL,
};
static ssize_t o2nm_cluster_show(struct config_item *item,
struct configfs_attribute *attr,
char *page)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
struct o2nm_cluster_attribute *o2nm_cluster_attr =
container_of(attr, struct o2nm_cluster_attribute, attr);
ssize_t ret = 0;
if (o2nm_cluster_attr->show)
ret = o2nm_cluster_attr->show(cluster, page);
return ret;
}
static ssize_t o2nm_cluster_store(struct config_item *item,
struct configfs_attribute *attr,
const char *page, size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
struct o2nm_cluster_attribute *o2nm_cluster_attr =
container_of(attr, struct o2nm_cluster_attribute, attr);
ssize_t ret;
if (o2nm_cluster_attr->store == NULL) {
ret = -EINVAL;
goto out;
}
ret = o2nm_cluster_attr->store(cluster, page, count);
if (ret < count)
goto out;
out:
return ret;
}
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
{
struct o2nm_node *node = NULL;
if (strlen(name) > O2NM_MAX_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
if (node == NULL)
return ERR_PTR(-ENOMEM);
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
return &node->nd_item;
}
static void o2nm_node_group_drop_item(struct config_group *group,
struct config_item *item)
{
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
o2net_disconnect_node(node);
if (cluster->cl_has_local &&
(cluster->cl_local_node == node->nd_num)) {
cluster->cl_has_local = 0;
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
o2net_stop_listening(node);
}
/* XXX call into net to stop this node from trading messages */
write_lock(&cluster->cl_nodes_lock);
/* XXX sloppy */
if (node->nd_ipv4_address)
rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
/* nd_num might be 0 if the node number hasn't been set.. */
if (cluster->cl_nodes[node->nd_num] == node) {
cluster->cl_nodes[node->nd_num] = NULL;
clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
config_item_name(&node->nd_item));
config_item_put(item);
}
static struct configfs_group_operations o2nm_node_group_group_ops = {
.make_item = o2nm_node_group_make_item,
.drop_item = o2nm_node_group_drop_item,
};
static struct config_item_type o2nm_node_group_type = {
.ct_group_ops = &o2nm_node_group_group_ops,
.ct_owner = THIS_MODULE,
};
/* cluster */
static void o2nm_cluster_release(struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
kfree(cluster->cl_group.default_groups);
kfree(cluster);
}
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
.show_attribute = o2nm_cluster_show,
.store_attribute = o2nm_cluster_store,
};
static struct config_item_type o2nm_cluster_type = {
.ct_item_ops = &o2nm_cluster_item_ops,
.ct_attrs = o2nm_cluster_attrs,
.ct_owner = THIS_MODULE,
};
/* cluster set */
struct o2nm_cluster_group {
struct configfs_subsystem cs_subsys;
/* some stuff? */
};
#if 0
static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
{
return group ?
container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
: NULL;
}
#endif
static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
const char *name)
{
struct o2nm_cluster *cluster = NULL;
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
void *defs = NULL;
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
return ERR_PTR(-ENOSPC);
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
goto out;
config_group_init_type_name(&cluster->cl_group, name,
&o2nm_cluster_type);
config_group_init_type_name(&ns->ns_group, "node",
&o2nm_node_group_type);
cluster->cl_group.default_groups = defs;
cluster->cl_group.default_groups[0] = &ns->ns_group;
cluster->cl_group.default_groups[1] = o2hb_group;
cluster->cl_group.default_groups[2] = NULL;
rwlock_init(&cluster->cl_nodes_lock);
cluster->cl_node_ip_tree = RB_ROOT;
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
cluster->cl_fence_method = O2NM_FENCE_RESET;
ret = &cluster->cl_group;
o2nm_single_cluster = cluster;
out:
if (ret == NULL) {
kfree(cluster);
kfree(ns);
o2hb_free_hb_set(o2hb_group);
kfree(defs);
ret = ERR_PTR(-ENOMEM);
}
return ret;
}
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
int i;
struct config_item *killme;
BUG_ON(o2nm_single_cluster != cluster);
o2nm_single_cluster = NULL;
for (i = 0; cluster->cl_group.default_groups[i]; i++) {
killme = &cluster->cl_group.default_groups[i]->cg_item;
cluster->cl_group.default_groups[i] = NULL;
config_item_put(killme);
}
config_item_put(item);
}
static struct configfs_group_operations o2nm_cluster_group_group_ops = {
.make_group = o2nm_cluster_group_make_group,
.drop_item = o2nm_cluster_group_drop_item,
};
static struct config_item_type o2nm_cluster_group_type = {
.ct_group_ops = &o2nm_cluster_group_group_ops,
.ct_owner = THIS_MODULE,
};
static struct o2nm_cluster_group o2nm_cluster_group = {
.cs_subsys = {
.su_group = {
.cg_item = {
.ci_namebuf = "cluster",
.ci_type = &o2nm_cluster_group_type,
},
},
},
};
int o2nm_depend_item(struct config_item *item)
{
return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
}
void o2nm_undepend_item(struct config_item *item)
{
configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
}
int o2nm_depend_this_node(void)
{
int ret = 0;
struct o2nm_node *local_node;
local_node = o2nm_get_node_by_num(o2nm_this_node());
if (!local_node) {
ret = -EINVAL;
goto out;
}
ret = o2nm_depend_item(&local_node->nd_item);
o2nm_node_put(local_node);
out:
return ret;
}
void o2nm_undepend_this_node(void)
{
struct o2nm_node *local_node;
local_node = o2nm_get_node_by_num(o2nm_this_node());
BUG_ON(!local_node);
o2nm_undepend_item(&local_node->nd_item);
o2nm_node_put(local_node);
}
static void __exit exit_o2nm(void)
{
/* XXX sync with hb callbacks and shut down hb? */
o2net_unregister_hb_callbacks();
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
o2cb_sys_shutdown();
o2net_exit();
o2hb_exit();
}
static int __init init_o2nm(void)
{
int ret = -1;
ret = o2hb_init();
if (ret)
goto out;
ret = o2net_init();
if (ret)
goto out_o2hb;
ret = o2net_register_hb_callbacks();
if (ret)
goto out_o2net;
config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
if (ret) {
printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
goto out_callbacks;
}
ret = o2cb_sys_init();
if (!ret)
goto out;
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
o2net_unregister_hb_callbacks();
out_o2net:
o2net_exit();
out_o2hb:
o2hb_exit();
out:
return ret;
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)

View file

@ -0,0 +1,88 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* nodemanager.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef O2CLUSTER_NODEMANAGER_H
#define O2CLUSTER_NODEMANAGER_H
#include "ocfs2_nodemanager.h"
/* This totally doesn't belong here. */
#include <linux/configfs.h>
#include <linux/rbtree.h>
enum o2nm_fence_method {
O2NM_FENCE_RESET = 0,
O2NM_FENCE_PANIC,
O2NM_FENCE_METHODS, /* Number of fence methods */
};
struct o2nm_node {
spinlock_t nd_lock;
struct config_item nd_item;
char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
__u8 nd_num;
/* only one address per node, as attributes, for now. */
__be32 nd_ipv4_address;
__be16 nd_ipv4_port;
struct rb_node nd_ip_node;
/* there can be only one local node for now */
int nd_local;
unsigned long nd_set_attributes;
};
struct o2nm_cluster {
struct config_group cl_group;
unsigned cl_has_local:1;
u8 cl_local_node;
rwlock_t cl_nodes_lock;
struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
struct rb_root cl_node_ip_tree;
unsigned int cl_idle_timeout_ms;
unsigned int cl_keepalive_delay_ms;
unsigned int cl_reconnect_delay_ms;
enum o2nm_fence_method cl_fence_method;
/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
extern struct o2nm_cluster *o2nm_single_cluster;
u8 o2nm_this_node(void);
int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
void o2nm_node_get(struct o2nm_node *node);
void o2nm_node_put(struct o2nm_node *node);
int o2nm_depend_item(struct config_item *item);
void o2nm_undepend_item(struct config_item *item);
int o2nm_depend_this_node(void);
void o2nm_undepend_this_node(void);
#endif /* O2CLUSTER_NODEMANAGER_H */

View file

@ -0,0 +1,38 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_heartbeat.h
*
* On-disk structures for ocfs2_heartbeat
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS2_HEARTBEAT_H
#define _OCFS2_HEARTBEAT_H
struct o2hb_disk_heartbeat_block {
__le64 hb_seq;
__u8 hb_node;
__u8 hb_pad1[3];
__le32 hb_cksum;
__le64 hb_generation;
__le32 hb_dead_ms;
};
#endif /* _OCFS2_HEARTBEAT_H */

View file

@ -0,0 +1,45 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_nodemanager.h
*
* Header describing the interface between userspace and the kernel
* for the ocfs2_nodemanager module.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef _OCFS2_NODEMANAGER_H
#define _OCFS2_NODEMANAGER_H
#define O2NM_API_VERSION 5
#define O2NM_MAX_NODES 255
#define O2NM_INVALID_NODE_NUM 255
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
/*
* Maximum number of global heartbeat regions allowed.
* **CAUTION** Changing this number will break dlm compatibility.
*/
#define O2NM_MAX_REGIONS 32
#endif /* _OCFS2_NODEMANAGER_H */

340
fs/ocfs2/cluster/quorum.c Normal file
View file

@ -0,0 +1,340 @@
/* -*- mode: c; c-basic-offset: 8; -*-
*
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
/* This quorum hack is only here until we transition to some more rational
* approach that is driven from userspace. Honest. No foolin'.
*
* Imagine two nodes lose network connectivity to each other but they're still
* up and operating in every other way. Presumably a network timeout indicates
* that a node is broken and should be recovered. They can't both recover each
* other and both carry on without serialising their access to the file system.
* They need to decide who is authoritative. Now extend that problem to
* arbitrary groups of nodes losing connectivity between each other.
*
* So we declare that a node which has given up on connecting to a majority
* of nodes who are still heartbeating will fence itself.
*
* There are huge opportunities for races here. After we give up on a node's
* connection we need to wait long enough to give heartbeat an opportunity
* to declare the node as truly dead. We also need to be careful with the
* race between when we see a node start heartbeating and when we connect
* to it.
*
* So nodes that are in this transtion put a hold on the quorum decision
* with a counter. As they fall out of this transition they drop the count
* and if they're the last, they fire off the decision.
*/
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
#include "heartbeat.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_QUORUM
#include "masklog.h"
#include "quorum.h"
static struct o2quo_state {
spinlock_t qs_lock;
struct work_struct qs_work;
int qs_pending;
int qs_heartbeating;
unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
int qs_connected;
unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
int qs_holds;
unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;
/* this is horribly heavy-handed. It should instead flip the file
* system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
/* panic spins with interrupts enabled. with preempt
* threads can still schedule, etc, etc */
o2hb_stop_all_regions();
switch (o2nm_single_cluster->cl_fence_method) {
case O2NM_FENCE_PANIC:
panic("*** ocfs2 is very sorry to be fencing this system by "
"panicing ***\n");
break;
default:
WARN_ON(o2nm_single_cluster->cl_fence_method >=
O2NM_FENCE_METHODS);
case O2NM_FENCE_RESET:
printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
"system by restarting ***\n");
emergency_restart();
break;
};
}
/* Indicate that a timeout occurred on a hearbeat region write. The
* other nodes in the cluster may consider us dead at that time so we
* want to "fence" ourselves so that we don't scribble on the disk
* after they think they've recovered us. This can't solve all
* problems related to writeout after recovery but this hack can at
* least close some of those gaps. When we have real fencing, this can
* go away as our node would be fenced externally before other nodes
* begin recovery. */
void o2quo_disk_timeout(void)
{
o2quo_fence_self();
}
static void o2quo_make_decision(struct work_struct *work)
{
int quorum;
int lowest_hb, lowest_reachable = 0, fence = 0;
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
if (lowest_hb != O2NM_MAX_NODES)
lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
mlog(0, "heartbeating: %d, connected: %d, "
"lowest: %d (%sreachable)\n", qs->qs_heartbeating,
qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
qs->qs_heartbeating == 1)
goto out;
if (qs->qs_heartbeating & 1) {
/* the odd numbered cluster case is straight forward --
* if we can't talk to the majority we're hosed */
quorum = (qs->qs_heartbeating + 1)/2;
if (qs->qs_connected < quorum) {
mlog(ML_ERROR, "fencing this node because it is "
"only connected to %u nodes and %u is needed "
"to make a quorum out of %u heartbeating nodes\n",
qs->qs_connected, quorum,
qs->qs_heartbeating);
fence = 1;
}
} else {
/* the even numbered cluster adds the possibility of each half
* of the cluster being able to talk amongst themselves.. in
* that case we're hosed if we can't talk to the group that has
* the lowest numbered node */
quorum = qs->qs_heartbeating / 2;
if (qs->qs_connected < quorum) {
mlog(ML_ERROR, "fencing this node because it is "
"only connected to %u nodes and %u is needed "
"to make a quorum out of %u heartbeating nodes\n",
qs->qs_connected, quorum,
qs->qs_heartbeating);
fence = 1;
}
else if ((qs->qs_connected == quorum) &&
!lowest_reachable) {
mlog(ML_ERROR, "fencing this node because it is "
"connected to a half-quorum of %u out of %u "
"nodes which doesn't include the lowest active "
"node %u\n", quorum, qs->qs_heartbeating,
lowest_hb);
fence = 1;
}
}
out:
if (fence) {
spin_unlock(&qs->qs_lock);
o2quo_fence_self();
} else {
mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
"connected: %d, lowest: %d (%sreachable)\n",
qs->qs_heartbeating, qs->qs_connected, lowest_hb,
lowest_reachable ? "" : "un");
spin_unlock(&qs->qs_lock);
}
}
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
assert_spin_locked(&qs->qs_lock);
if (!test_and_set_bit(node, qs->qs_hold_bm)) {
qs->qs_holds++;
mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
"node %u\n", node);
mlog(0, "node %u, %d total\n", node, qs->qs_holds);
}
}
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
assert_spin_locked(&qs->qs_lock);
if (test_and_clear_bit(node, qs->qs_hold_bm)) {
mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
if (--qs->qs_holds == 0) {
if (qs->qs_pending) {
qs->qs_pending = 0;
schedule_work(&qs->qs_work);
}
}
mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
node, qs->qs_holds);
}
}
/* as a node comes up we delay the quorum decision until we know the fate of
* the connection. the hold will be droped in conn_up or hb_down. it might be
* perpetuated by con_err until hb_down. if we already have a conn, we might
* be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
qs->qs_heartbeating++;
mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
"node %u\n", node);
mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
set_bit(node, qs->qs_hb_bm);
mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
if (!test_bit(node, qs->qs_conn_bm))
o2quo_set_hold(qs, node);
else
o2quo_clear_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
/* hb going down releases any holds we might have had due to this node from
* conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
qs->qs_heartbeating--;
mlog_bug_on_msg(qs->qs_heartbeating < 0,
"node %u, %d heartbeating\n",
node, qs->qs_heartbeating);
mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
clear_bit(node, qs->qs_hb_bm);
mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
o2quo_clear_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
/* this tells us that we've decided that the node is still heartbeating
* even though we've lost it's conn. it must only be called after conn_err
* and indicates that we must now make a quorum decision in the future,
* though we might be doing so after waiting for holds to drain. Here
* we'll be dropping the hold from conn_err. */
void o2quo_hb_still_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
mlog(0, "node %u\n", node);
qs->qs_pending = 1;
o2quo_clear_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be droped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
* it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
qs->qs_connected++;
mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
"node %u\n", node);
mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
set_bit(node, qs->qs_conn_bm);
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
if (!test_bit(node, qs->qs_hb_bm))
o2quo_set_hold(qs, node);
else
o2quo_clear_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
/* we've decided that we won't ever be connecting to the node again. if it's
* still heartbeating we grab a hold that will delay decisions until either the
* node stops heartbeating from hb_down or the caller decides that the node is
* still up and calls still_up */
void o2quo_conn_err(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock(&qs->qs_lock);
if (test_bit(node, qs->qs_conn_bm)) {
qs->qs_connected--;
mlog_bug_on_msg(qs->qs_connected < 0,
"node %u, connected %d\n",
node, qs->qs_connected);
clear_bit(node, qs->qs_conn_bm);
}
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
if (test_bit(node, qs->qs_hb_bm))
o2quo_set_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
void o2quo_init(void)
{
struct o2quo_state *qs = &o2quo_state;
spin_lock_init(&qs->qs_lock);
INIT_WORK(&qs->qs_work, o2quo_make_decision);
}
void o2quo_exit(void)
{
struct o2quo_state *qs = &o2quo_state;
flush_work(&qs->qs_work);
}

36
fs/ocfs2/cluster/quorum.h Normal file
View file

@ -0,0 +1,36 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef O2CLUSTER_QUORUM_H
#define O2CLUSTER_QUORUM_H
void o2quo_init(void);
void o2quo_exit(void);
void o2quo_hb_up(u8 node);
void o2quo_hb_down(u8 node);
void o2quo_hb_still_up(u8 node);
void o2quo_conn_up(u8 node);
void o2quo_conn_err(u8 node);
void o2quo_disk_timeout(void);
#endif /* O2CLUSTER_QUORUM_H */

82
fs/ocfs2/cluster/sys.c Normal file
View file

@ -0,0 +1,82 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* sys.c
*
* OCFS2 cluster sysfs interface
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation,
* version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/fs.h>
#include "ocfs2_nodemanager.h"
#include "masklog.h"
#include "sys.h"
static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
__ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
NULL,
};
static struct attribute_group o2cb_attr_group = {
.attrs = o2cb_attrs,
};
static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
kset_unregister(o2cb_kset);
}
int o2cb_sys_init(void)
{
int ret;
o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
if (!o2cb_kset)
return -ENOMEM;
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
ret = mlog_sys_init(o2cb_kset);
if (ret)
goto error;
return 0;
error:
kset_unregister(o2cb_kset);
return ret;
}

33
fs/ocfs2/cluster/sys.h Normal file
View file

@ -0,0 +1,33 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* sys.h
*
* Function prototypes for o2cb sysfs interface
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation,
* version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef O2CLUSTER_SYS_H
#define O2CLUSTER_SYS_H
void o2cb_sys_shutdown(void);
int o2cb_sys_init(void);
#endif /* O2CLUSTER_SYS_H */

2218
fs/ocfs2/cluster/tcp.c Normal file

File diff suppressed because it is too large Load diff

155
fs/ocfs2/cluster/tcp.h Normal file
View file

@ -0,0 +1,155 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* tcp.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef O2CLUSTER_TCP_H
#define O2CLUSTER_TCP_H
#include <linux/socket.h>
#ifdef __KERNEL__
#include <net/sock.h>
#include <linux/tcp.h>
#else
#include <sys/socket.h>
#endif
#include <linux/inet.h>
#include <linux/in.h>
struct o2net_msg
{
__be16 magic;
__be16 data_len;
__be16 msg_type;
__be16 pad1;
__be32 sys_status;
__be32 status;
__be32 key;
__be32 msg_num;
__u8 buf[0];
};
typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
typedef void (o2net_post_msg_handler_func)(int status, void *data,
void *ret_data);
#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
/* same as hb delay, we're waiting for another node to recognize our hb */
#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
#define O2NET_TCP_USER_TIMEOUT 0x7fffffff
/* TODO: figure this out.... */
static inline int o2net_link_down(int err, struct socket *sock)
{
if (sock) {
if (sock->sk->sk_state != TCP_ESTABLISHED &&
sock->sk->sk_state != TCP_CLOSE_WAIT)
return 1;
}
if (err >= 0)
return 0;
switch (err) {
/* ????????????????????????? */
case -ERESTARTSYS:
case -EBADF:
/* When the server has died, an ICMP port unreachable
* message prompts ECONNREFUSED. */
case -ECONNREFUSED:
case -ENOTCONN:
case -ECONNRESET:
case -EPIPE:
return 1;
}
return 0;
}
enum {
O2NET_DRIVER_UNINITED,
O2NET_DRIVER_READY,
};
int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
u8 target_node, int *status);
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
size_t veclen, u8 target_node, int *status);
int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
o2net_msg_handler_func *func, void *data,
o2net_post_msg_handler_func *post_func,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
void o2net_fill_node_map(unsigned long *map, unsigned bytes);
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
int o2net_start_listening(struct o2nm_node *node);
void o2net_stop_listening(struct o2nm_node *node);
void o2net_disconnect_node(struct o2nm_node *node);
int o2net_num_connected_peers(void);
int o2net_init(void);
void o2net_exit(void);
struct o2net_send_tracking;
struct o2net_sock_container;
#ifdef CONFIG_DEBUG_FS
int o2net_debugfs_init(void);
void o2net_debugfs_exit(void);
void o2net_debug_add_nst(struct o2net_send_tracking *nst);
void o2net_debug_del_nst(struct o2net_send_tracking *nst);
void o2net_debug_add_sc(struct o2net_sock_container *sc);
void o2net_debug_del_sc(struct o2net_sock_container *sc);
#else
static inline int o2net_debugfs_init(void)
{
return 0;
}
static inline void o2net_debugfs_exit(void)
{
}
static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
}
static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
}
static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
}
static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
}
#endif /* CONFIG_DEBUG_FS */
#endif /* O2CLUSTER_TCP_H */

View file

@ -0,0 +1,242 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef O2CLUSTER_TCP_INTERNAL_H
#define O2CLUSTER_TCP_INTERNAL_H
#define O2NET_MSG_MAGIC ((u16)0xfa55)
#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56)
#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57)
#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
/* we're delaying our quorum decision so that heartbeat will have timed
* out truly dead nodes by the time we come around to making decisions
* on their number */
#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
/*
* This version number represents quite a lot, unfortunately. It not
* only represents the raw network message protocol on the wire but also
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
* With version 11, we separate out the filesystem locking portion. The
* filesystem now has a major.minor version it negotiates. Version 11
* introduces this negotiation to the o2dlm protocol, and as such the
* version here in tcp_internal.h should not need to be bumped for
* filesystem locking changes.
*
* New in version 11
* - Negotiation of filesystem locking in the dlm join.
*
* New in version 10:
* - Meta/data locks combined
*
* New in version 9:
* - All votes removed
*
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
* New in version 7:
* - DLM join domain includes the live nodemap
*
* New in version 6:
* - DLM lockres remote refcount fixes.
*
* New in version 5:
* - Network timeout checking protocol
*
* New in version 4:
* - Remove i_generation from lock names for better stat performance.
*
* New in version 3:
* - Replace dentry votes with a cluster lock
*
* New in version 2:
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
#define O2NET_PROTOCOL_VERSION 11ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
__be32 o2hb_heartbeat_timeout_ms;
__be32 o2net_idle_timeout_ms;
__be32 o2net_keepalive_delay_ms;
__be32 o2net_reconnect_delay_ms;
};
struct o2net_node {
/* this is never called from int/bh */
spinlock_t nn_lock;
/* set the moment an sc is allocated and a connect is started */
struct o2net_sock_container *nn_sc;
/* _valid is only set after the handshake passes and tx can happen */
unsigned nn_sc_valid:1;
/* if this is set tx just returns it */
int nn_persistent_error;
/* It is only set to 1 after the idle time out. */
atomic_t nn_timeout;
/* threads waiting for an sc to arrive wait on the wq for generation
* to increase. it is increased when a connecting socket succeeds
* or fails or when an accepted socket is attached. */
wait_queue_head_t nn_sc_wq;
struct idr nn_status_idr;
struct list_head nn_status_list;
/* connects are attempted from when heartbeat comes up until either hb
* goes down, the node is unconfigured, no connect attempts succeed
* before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
* is queued from set_nn_state both from hb up and from itself if a
* connect attempt fails and so can be self-arming. shutdown is
* careful to first mark the nn such that no connects will be attempted
* before canceling delayed connect work and flushing the queue. */
struct delayed_work nn_connect_work;
unsigned long nn_last_connect_attempt;
/* this is queued as nodes come up and is canceled when a connection is
* established. this expiring gives up on the node and errors out
* transmits */
struct delayed_work nn_connect_expired;
/* after we give up on a socket we wait a while before deciding
* that it is still heartbeating and that we should do some
* quorum work */
struct delayed_work nn_still_up;
};
struct o2net_sock_container {
struct kref sc_kref;
/* the next two are valid for the life time of the sc */
struct socket *sc_sock;
struct o2nm_node *sc_node;
/* all of these sc work structs hold refs on the sc while they are
* queued. they should not be able to ref a freed sc. the teardown
* race is with o2net_wq destruction in o2net_stop_listening() */
/* rx and connect work are generated from socket callbacks. sc
* shutdown removes the callbacks and then flushes the work queue */
struct work_struct sc_rx_work;
struct work_struct sc_connect_work;
/* shutdown work is triggered in two ways. the simple way is
* for a code path calls ensure_shutdown which gets a lock, removes
* the sc from the nn, and queues the work. in this case the
* work is single-shot. the work is also queued from a sock
* callback, though, and in this case the work will find the sc
* still on the nn and will call ensure_shutdown itself.. this
* ends up triggering the shutdown work again, though nothing
* will be done in that second iteration. so work queue teardown
* has to be careful to remove the sc from the nn before waiting
* on the work queue so that the shutdown work doesn't remove the
* sc and rearm itself.
*/
struct work_struct sc_shutdown_work;
struct timer_list sc_idle_timeout;
struct delayed_work sc_keepalive_work;
unsigned sc_handshake_ok:1;
struct page *sc_page;
size_t sc_page_off;
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
void (*sc_data_ready)(struct sock *sk);
u32 sc_msg_key;
u16 sc_msg_type;
#ifdef CONFIG_DEBUG_FS
struct list_head sc_net_debug_item;
ktime_t sc_tv_timer;
ktime_t sc_tv_data_ready;
ktime_t sc_tv_advance_start;
ktime_t sc_tv_advance_stop;
ktime_t sc_tv_func_start;
ktime_t sc_tv_func_stop;
#endif
#ifdef CONFIG_OCFS2_FS_STATS
ktime_t sc_tv_acquiry_total;
ktime_t sc_tv_send_total;
ktime_t sc_tv_status_total;
u32 sc_send_count;
u32 sc_recv_count;
ktime_t sc_tv_process_total;
#endif
struct mutex sc_send_lock;
};
struct o2net_msg_handler {
struct rb_node nh_node;
u32 nh_max_len;
u32 nh_msg_type;
u32 nh_key;
o2net_msg_handler_func *nh_func;
o2net_msg_handler_func *nh_func_data;
o2net_post_msg_handler_func
*nh_post_func;
struct kref nh_kref;
struct list_head nh_unregister_item;
};
enum o2net_system_error {
O2NET_ERR_NONE = 0,
O2NET_ERR_NO_HNDLR,
O2NET_ERR_OVERFLOW,
O2NET_ERR_DIED,
O2NET_ERR_MAX
};
struct o2net_status_wait {
enum o2net_system_error ns_sys_status;
s32 ns_status;
int ns_id;
wait_queue_head_t ns_wq;
struct list_head ns_node_item;
};
#ifdef CONFIG_DEBUG_FS
/* just for state dumps */
struct o2net_send_tracking {
struct list_head st_net_debug_item;
struct task_struct *st_task;
struct o2net_sock_container *st_sc;
u32 st_id;
u32 st_msg_type;
u32 st_msg_key;
u8 st_node;
ktime_t st_sock_time;
ktime_t st_send_time;
ktime_t st_status_time;
};
#else
struct o2net_send_tracking {
u32 dummy;
};
#endif /* CONFIG_DEBUG_FS */
#endif /* O2CLUSTER_TCP_INTERNAL_H */

476
fs/ocfs2/dcache.c Normal file
View file

@ -0,0 +1,476 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.c
*
* dentry cache handling code
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dcache.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "ocfs2_trace.h"
void ocfs2_dentry_attach_gen(struct dentry *dentry)
{
unsigned long gen =
OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
BUG_ON(dentry->d_inode);
dentry->d_fsdata = (void *)gen;
}
static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
int ret = 0; /* if all else fails, just return false */
struct ocfs2_super *osb;
if (flags & LOOKUP_RCU)
return -ECHILD;
inode = dentry->d_inode;
osb = OCFS2_SB(dentry->d_sb);
trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
dentry->d_name.name);
/* For a negative dentry -
* check the generation number of the parent and compare with the
* one stored in the inode.
*/
if (inode == NULL) {
unsigned long gen = (unsigned long) dentry->d_fsdata;
unsigned long pgen;
spin_lock(&dentry->d_lock);
pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
spin_unlock(&dentry->d_lock);
trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
dentry->d_name.name,
pgen, gen);
if (gen != pgen)
goto bail;
goto valid;
}
BUG_ON(!osb);
if (inode == osb->root_inode || is_bad_inode(inode))
goto bail;
spin_lock(&OCFS2_I(inode)->ip_lock);
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
trace_ocfs2_dentry_revalidate_delete(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
/*
* We don't need a cluster lock to test this because once an
* inode nlink hits zero, it never goes back.
*/
if (inode->i_nlink == 0) {
trace_ocfs2_dentry_revalidate_orphaned(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
S_ISDIR(inode->i_mode));
goto bail;
}
/*
* If the last lookup failed to create dentry lock, let us
* redo it.
*/
if (!dentry->d_fsdata) {
trace_ocfs2_dentry_revalidate_nofsdata(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
valid:
ret = 1;
bail:
trace_ocfs2_dentry_revalidate_ret(ret);
return ret;
}
static int ocfs2_match_dentry(struct dentry *dentry,
u64 parent_blkno,
int skip_unhashed)
{
struct inode *parent;
/*
* ocfs2_lookup() does a d_splice_alias() _before_ attaching
* to the lock data, so we skip those here, otherwise
* ocfs2_dentry_attach_lock() will get its original dentry
* back.
*/
if (!dentry->d_fsdata)
return 0;
if (!dentry->d_parent)
return 0;
if (skip_unhashed && d_unhashed(dentry))
return 0;
parent = dentry->d_parent->d_inode;
/* Negative parent dentry? */
if (!parent)
return 0;
/* Name is in a different directory. */
if (OCFS2_I(parent)->ip_blkno != parent_blkno)
return 0;
return 1;
}
/*
* Walk the inode alias list, and find a dentry which has a given
* parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
* is looking for a dentry_lock reference. The downconvert thread is
* looking to unhash aliases, so we allow it to skip any that already
* have that property.
*/
struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
int skip_unhashed)
{
struct dentry *dentry;
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
trace_ocfs2_find_local_alias(dentry->d_name.len,
dentry->d_name.name);
dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
return dentry;
}
spin_unlock(&dentry->d_lock);
}
spin_unlock(&inode->i_lock);
return NULL;
}
DEFINE_SPINLOCK(dentry_attach_lock);
/*
* Attach this dentry to a cluster lock.
*
* Dentry locks cover all links in a given directory to a particular
* inode. We do this so that ocfs2 can build a lock name which all
* nodes in the cluster can agree on at all times. Shoving full names
* in the cluster lock won't work due to size restrictions. Covering
* links inside of a directory is a good compromise because it still
* allows us to use the parent directory lock to synchronize
* operations.
*
* Call this function with the parent dir semaphore and the parent dir
* cluster lock held.
*
* The dir semaphore will protect us from having to worry about
* concurrent processes on our node trying to attach a lock at the
* same time.
*
* The dir cluster lock (held at either PR or EX mode) protects us
* from unlink and rename on other nodes.
*
* A dput() can happen asynchronously due to pruning, so we cover
* attaching and detaching the dentry lock with a
* dentry_attach_lock.
*
* A node which has done lookup on a name retains a protected read
* lock until final dput. If the user requests and unlink or rename,
* the protected read is upgraded to an exclusive lock. Other nodes
* who have seen the dentry will then be informed that they need to
* downgrade their lock, which will involve d_delete on the
* dentry. This happens in ocfs2_dentry_convert_worker().
*/
int ocfs2_dentry_attach_lock(struct dentry *dentry,
struct inode *inode,
u64 parent_blkno)
{
int ret;
struct dentry *alias;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
(unsigned long long)parent_blkno, dl);
/*
* Negative dentry. We ignore these for now.
*
* XXX: Could we can improve ocfs2_dentry_revalidate() by
* tracking these?
*/
if (!inode)
return 0;
if (!dentry->d_inode && dentry->d_fsdata) {
/* Converting a negative dentry to positive
Clear dentry->d_fsdata */
dentry->d_fsdata = dl = NULL;
}
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
dentry->d_name.len, dentry->d_name.name,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
return 0;
}
alias = ocfs2_find_local_alias(inode, parent_blkno, 0);
if (alias) {
/*
* Great, an alias exists, which means we must have a
* dentry lock already. We can just grab the lock off
* the alias and add it to the list.
*
* We're depending here on the fact that this dentry
* was found and exists in the dcache and so must have
* a reference to the dentry_lock because we can't
* race creates. Final dput() cannot happen on it
* since we have it pinned, so our reference is safe.
*/
dl = alias->d_fsdata;
mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n",
(unsigned long long)parent_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
dentry->d_name.len, dentry->d_name.name,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
(unsigned long long)parent_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
goto out_attach;
}
/*
* There are no other aliases
*/
dl = kmalloc(sizeof(*dl), GFP_NOFS);
if (!dl) {
ret = -ENOMEM;
mlog_errno(ret);
return ret;
}
dl->dl_count = 0;
/*
* Does this have to happen below, for all attaches, in case
* the struct inode gets blown away by the downconvert thread?
*/
dl->dl_inode = igrab(inode);
dl->dl_parent_blkno = parent_blkno;
ocfs2_dentry_lock_res_init(dl, parent_blkno, inode);
out_attach:
spin_lock(&dentry_attach_lock);
dentry->d_fsdata = dl;
dl->dl_count++;
spin_unlock(&dentry_attach_lock);
/*
* This actually gets us our PRMODE level lock. From now on,
* we'll have a notification if one of these names is
* destroyed on another node.
*/
ret = ocfs2_dentry_lock(dentry, 0);
if (!ret)
ocfs2_dentry_unlock(dentry, 0);
else
mlog_errno(ret);
/*
* In case of error, manually free the allocation and do the iput().
* We need to do this because error here means no d_instantiate(),
* which means iput() will not be called during dput(dentry).
*/
if (ret < 0 && !alias) {
ocfs2_lock_res_free(&dl->dl_lockres);
BUG_ON(dl->dl_count != 1);
spin_lock(&dentry_attach_lock);
dentry->d_fsdata = NULL;
spin_unlock(&dentry_attach_lock);
kfree(dl);
iput(inode);
}
dput(alias);
return ret;
}
/*
* ocfs2_dentry_iput() and friends.
*
* At this point, our particular dentry is detached from the inodes
* alias list, so there's no way that the locking code can find it.
*
* The interesting stuff happens when we determine that our lock needs
* to go away because this is the last subdir alias in the
* system. This function needs to handle a couple things:
*
* 1) Synchronizing lock shutdown with the downconvert threads. This
* is already handled for us via the lockres release drop function
* called in ocfs2_release_dentry_lock()
*
* 2) A race may occur when we're doing our lock shutdown and
* another process wants to create a new dentry lock. Right now we
* let them race, which means that for a very short while, this
* node might have two locks on a lock resource. This should be a
* problem though because one of them is in the process of being
* thrown out.
*/
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
kfree(dl);
}
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
int unlock = 0;
BUG_ON(dl->dl_count == 0);
spin_lock(&dentry_attach_lock);
dl->dl_count--;
unlock = !dl->dl_count;
spin_unlock(&dentry_attach_lock);
if (unlock)
ocfs2_drop_dentry_lock(osb, dl);
}
static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
{
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
if (!dl) {
/*
* No dentry lock is ok if we're disconnected or
* unhashed.
*/
if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
!d_unhashed(dentry)) {
unsigned long long ino = 0ULL;
if (inode)
ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
mlog(ML_ERROR, "Dentry is missing cluster lock. "
"inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
ino, dentry->d_flags, dentry->d_name.len,
dentry->d_name.name);
}
goto out;
}
mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
dentry->d_name.len, dentry->d_name.name,
dl->dl_count);
ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl);
out:
iput(inode);
}
/*
* d_move(), but keep the locks in sync.
*
* When we are done, "dentry" will have the parent dir and name of
* "target", which will be thrown away.
*
* We manually update the lock of "dentry" if need be.
*
* "target" doesn't have it's dentry lock touched - we allow the later
* dput() to handle this for us.
*
* This is called during ocfs2_rename(), while holding parent
* directory locks. The dentries have already been deleted on other
* nodes via ocfs2_remote_dentry_delete().
*
* Normally, the VFS handles the d_move() for the file system, after
* the ->rename() callback. OCFS2 wants to handle this internally, so
* the new lock can be created atomically with respect to the cluster.
*/
void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb);
struct inode *inode = dentry->d_inode;
/*
* Move within the same directory, so the actual lock info won't
* change.
*
* XXX: Is there any advantage to dropping the lock here?
*/
if (old_dir == new_dir)
goto out_move;
ocfs2_dentry_lock_put(osb, dentry->d_fsdata);
dentry->d_fsdata = NULL;
ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno);
if (ret)
mlog_errno(ret);
out_move:
d_move(dentry, target);
}
const struct dentry_operations ocfs2_dentry_ops = {
.d_revalidate = ocfs2_dentry_revalidate,
.d_iput = ocfs2_dentry_iput,
};

59
fs/ocfs2/dcache.h Normal file
View file

@ -0,0 +1,59 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dcache.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DCACHE_H
#define OCFS2_DCACHE_H
extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
unsigned int dl_count;
u64 dl_parent_blkno;
/*
* The ocfs2_dentry_lock keeps an inode reference until
* dl_lockres has been destroyed. This is usually done in
* ->d_iput() anyway, so there should be minimal impact.
*/
struct inode *dl_inode;
struct ocfs2_lock_res dl_lockres;
};
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */

4506
fs/ocfs2/dir.c Normal file

File diff suppressed because it is too large Load diff

116
fs/ocfs2/dir.h Normal file
View file

@ -0,0 +1,116 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dir.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DIR_H
#define OCFS2_DIR_H
struct ocfs2_dx_hinfo {
u32 major_hash;
u32 minor_hash;
};
struct ocfs2_dir_lookup_result {
struct buffer_head *dl_leaf_bh; /* Unindexed leaf
* block */
struct ocfs2_dir_entry *dl_entry; /* Target dirent in
* unindexed leaf */
struct buffer_head *dl_dx_root_bh; /* Root of indexed
* tree */
struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */
struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in
* indexed leaf */
struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
struct buffer_head *dl_prev_leaf_bh;/* Previous entry in
* dir free space
* list. NULL if
* previous entry is
* dx root block. */
};
void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
int ocfs2_find_entry(const char *name, int namelen,
struct inode *dir,
struct ocfs2_dir_lookup_result *lookup);
int ocfs2_delete_entry(handle_t *handle,
struct inode *dir,
struct ocfs2_dir_lookup_result *res);
int __ocfs2_add_entry(handle_t *handle,
struct inode *dir,
const char *name, int namelen,
struct inode *inode, u64 blkno,
struct buffer_head *parent_fe_bh,
struct ocfs2_dir_lookup_result *lookup);
static inline int ocfs2_add_entry(handle_t *handle,
struct dentry *dentry,
struct inode *inode, u64 blkno,
struct buffer_head *parent_fe_bh,
struct ocfs2_dir_lookup_result *lookup)
{
return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
dentry->d_name.name, dentry->d_name.len,
inode, blkno, parent_fe_bh, lookup);
}
int ocfs2_update_entry(struct inode *dir, handle_t *handle,
struct ocfs2_dir_lookup_result *res,
struct inode *new_entry_inode);
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen);
int ocfs2_empty_dir(struct inode *inode);
int ocfs2_find_files_on_disk(const char *name,
int namelen,
u64 *blkno,
struct inode *inode,
struct ocfs2_dir_lookup_result *res);
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
int namelen, u64 *blkno);
int ocfs2_readdir(struct file *file, struct dir_context *ctx);
int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
const char *name,
int namelen,
struct ocfs2_dir_lookup_result *lookup);
struct ocfs2_alloc_context;
int ocfs2_fill_new_dir(struct ocfs2_super *osb,
handle_t *handle,
struct inode *parent,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac);
int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
void *data);
#endif /* OCFS2_DIR_H */

7
fs/ocfs2/dlm/Makefile Normal file
View file

@ -0,0 +1,7 @@
ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o

220
fs/ocfs2/dlm/dlmapi.h Normal file
View file

@ -0,0 +1,220 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmapi.h
*
* externally exported dlm interfaces
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMAPI_H
#define DLMAPI_H
struct dlm_lock;
struct dlm_ctxt;
/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
enum dlm_status {
DLM_NORMAL = 0, /* 0: request in progress */
DLM_GRANTED, /* 1: request granted */
DLM_DENIED, /* 2: request denied */
DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */
DLM_WORKING, /* 4: async request in progress */
DLM_BLOCKED, /* 5: lock request blocked */
DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by a orphan lock*/
DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */
DLM_SYSERR, /* 8: system error */
DLM_NOSUPPORT, /* 9: unsupported */
DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */
DLM_IVLOCKID, /* 11: bad lockid */
DLM_SYNC, /* 12: synchronous request granted */
DLM_BADTYPE, /* 13: bad resource type */
DLM_BADRESOURCE, /* 14: bad resource handle */
DLM_MAXHANDLES, /* 15: no more resource handles */
DLM_NOCLINFO, /* 16: can't contact cluster manager */
DLM_NOLOCKMGR, /* 17: can't contact lock manager */
DLM_NOPURGED, /* 18: can't contact purge daemon */
DLM_BADARGS, /* 19: bad api args */
DLM_VOID, /* 20: no status */
DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */
DLM_IVBUFLEN, /* 22: invalid resource name length */
DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */
DLM_BADPARAM, /* 24: invalid lock mode specified */
DLM_VALNOTVALID, /* 25: value block has been invalidated */
DLM_REJECTED, /* 26: request rejected, unrecognized client */
DLM_ABORT, /* 27: blocked lock request cancelled */
DLM_CANCEL, /* 28: conversion request cancelled */
DLM_IVRESHANDLE, /* 29: invalid resource handle */
DLM_DEADLOCK, /* 30: deadlock recovery refused this request */
DLM_DENIED_NOASTS, /* 31: failed to allocate AST */
DLM_FORWARD, /* 32: request must wait for primary's response */
DLM_TIMEOUT, /* 33: timeout value for lock has expired */
DLM_IVGROUPID, /* 34: invalid group specification */
DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */
DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */
DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */
DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */
DLM_RECOVERING, /* 39: extension, allows caller to fail a lock
request if it is being recovered */
DLM_MIGRATING, /* 40: extension, allows caller to fail a lock
request if it is being migrated */
DLM_MAXSTATS, /* 41: upper limit for return code validation */
};
/* for pretty-printing dlm_status error messages */
const char *dlm_errmsg(enum dlm_status err);
/* for pretty-printing dlm_status error names */
const char *dlm_errname(enum dlm_status err);
/* Eventually the DLM will use standard errno values, but in the
* meantime this lets us track dlm errors as they bubble up. When we
* bring its error reporting into line with the rest of the stack,
* these can just be replaced with calls to mlog_errno. */
#define dlm_error(st) do { \
if ((st) != DLM_RECOVERING && \
(st) != DLM_MIGRATING && \
(st) != DLM_FORWARD) \
mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
} while (0)
#define DLM_LKSB_UNUSED1 0x01
#define DLM_LKSB_PUT_LVB 0x02
#define DLM_LKSB_GET_LVB 0x04
#define DLM_LKSB_UNUSED2 0x08
#define DLM_LKSB_UNUSED3 0x10
#define DLM_LKSB_UNUSED4 0x20
#define DLM_LKSB_UNUSED5 0x40
#define DLM_LKSB_UNUSED6 0x80
#define DLM_LVB_LEN 64
/* Callers are only allowed access to the lvb and status members of
* this struct. */
struct dlm_lockstatus {
enum dlm_status status;
u32 flags;
struct dlm_lock *lockid;
char lvb[DLM_LVB_LEN];
};
/* Valid lock modes. */
#define LKM_IVMODE (-1) /* invalid mode */
#define LKM_NLMODE 0 /* null lock */
#define LKM_CRMODE 1 /* concurrent read unsupported */
#define LKM_CWMODE 2 /* concurrent write unsupported */
#define LKM_PRMODE 3 /* protected read */
#define LKM_PWMODE 4 /* protected write unsupported */
#define LKM_EXMODE 5 /* exclusive */
#define LKM_MAXMODE 5
#define LKM_MODEMASK 0xff
/* Flags passed to dlmlock and dlmunlock:
* reserved: flags used by the "real" dlm
* only a few are supported by this dlm
* (U) = unsupported by ocfs2 dlm */
#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
#define LKM_LOCAL 0x00000080 /* local lock request */
#define LKM_VALBLK 0x00000100 /* lock value block request */
#define LKM_NOQUEUE 0x00000200 /* non blocking request */
#define LKM_CONVERT 0x00000400 /* conversion request */
#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */
#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
#define LKM_CANCEL 0x00002000 /* cancel conversion request */
#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */
#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
#define LKM_FORCE 0x00800000 /* force unlock flag */
#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
lock value block (U) */
/* unused */
#define LKM_UNUSED1 0x00000001 /* unused */
#define LKM_UNUSED2 0x00000002 /* unused */
#define LKM_UNUSED3 0x00000004 /* unused */
#define LKM_UNUSED4 0x00000008 /* unused */
#define LKM_UNUSED5 0x02000000 /* unused */
#define LKM_UNUSED6 0x04000000 /* unused */
#define LKM_UNUSED7 0x08000000 /* unused */
/* ocfs2 extensions: internal only
* should never be used by caller */
#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
to another node */
#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed
should be applied to lockres */
#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
from lockres when lock is granted */
#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
used to avoid recovery rwsem */
typedef void (dlm_astlockfunc_t)(void *);
typedef void (dlm_bastlockfunc_t)(void *, int);
typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
enum dlm_status dlmlock(struct dlm_ctxt *dlm,
int mode,
struct dlm_lockstatus *lksb,
int flags,
const char *name,
int namelen,
dlm_astlockfunc_t *ast,
void *data,
dlm_bastlockfunc_t *bast);
enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
struct dlm_lockstatus *lksb,
int flags,
dlm_astunlockfunc_t *unlockast,
void *data);
struct dlm_protocol_version {
u8 pv_major;
u8 pv_minor;
};
struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
struct dlm_protocol_version *fs_proto);
void dlm_unregister_domain(struct dlm_ctxt *dlm);
void dlm_print_one_lock(struct dlm_lock *lockid);
typedef void (dlm_eviction_func)(int, void *);
struct dlm_eviction_cb {
struct list_head ec_item;
dlm_eviction_func *ec_func;
void *ec_data;
};
void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
dlm_eviction_func *f,
void *data);
void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
struct dlm_eviction_cb *cb);
void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
#endif /* DLMAPI_H */

500
fs/ocfs2/dlm/dlmast.c Normal file
View file

@ -0,0 +1,500 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmast.c
*
* AST and BAST functionality for local and remote nodes
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock);
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
/* Should be called as an ast gets queued to see if the new
* lock level will obsolete a pending bast.
* For example, if dlm_thread queued a bast for an EX lock that
* was blocking another EX, but before sending the bast the
* lock owner downconverted to NL, the bast is now obsolete.
* Only the ast should be sent.
* This is needed because the lock and convert paths can queue
* asts out-of-band (not waiting for dlm_thread) in order to
* allow for LKM_NOQUEUE to get immediate responses. */
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&lock->spinlock);
if (lock->ml.highest_blocked == LKM_IVMODE)
return 0;
BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
if (lock->bast_pending &&
list_empty(&lock->bast_list))
/* old bast already sent, ok */
return 0;
if (lock->ml.type == LKM_EXMODE)
/* EX blocks anything left, any bast still valid */
return 0;
else if (lock->ml.type == LKM_NLMODE)
/* NL blocks nothing, no reason to send any bast, cancel it */
return 1;
else if (lock->ml.highest_blocked != LKM_EXMODE)
/* PR only blocks EX */
return 1;
return 0;
}
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
res = lock->lockres;
assert_spin_locked(&dlm->ast_lock);
if (!list_empty(&lock->ast_list)) {
mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
"AST list not empty, pending %d, newlevel %d\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ast_pending, lock->ml.type);
BUG();
}
if (lock->ast_pending)
mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
spin_lock(&lock->spinlock);
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
/* removing lock from list, remove a ref. guaranteed
* this won't be the last ref because of the get above,
* so res->spinlock will not be taken here */
dlm_lock_put(lock);
/* free up the reserved bast that we are cancelling.
* guaranteed that this will not be the last reserved
* ast because *both* an ast and a bast were reserved
* to get to this point. the res->spinlock will not be
* taken here */
dlm_lockres_release_ast(dlm, res);
}
list_add_tail(&lock->ast_list, &dlm->pending_asts);
lock->ast_pending = 1;
spin_unlock(&lock->spinlock);
}
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
BUG_ON(!dlm);
BUG_ON(!lock);
spin_lock(&dlm->ast_lock);
__dlm_queue_ast(dlm, lock);
spin_unlock(&dlm->ast_lock);
}
void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
assert_spin_locked(&dlm->ast_lock);
res = lock->lockres;
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
spin_lock(&lock->spinlock);
list_add_tail(&lock->bast_list, &dlm->pending_basts);
lock->bast_pending = 1;
spin_unlock(&lock->spinlock);
}
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
BUG_ON(!dlm);
BUG_ON(!lock);
spin_lock(&dlm->ast_lock);
__dlm_queue_bast(dlm, lock);
spin_unlock(&dlm->ast_lock);
}
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
struct dlm_lockstatus *lksb = lock->lksb;
BUG_ON(!lksb);
/* only updates if this node masters the lockres */
spin_lock(&res->spinlock);
if (res->owner == dlm->node_num) {
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
mlog(0, "getting lvb from lockres for %s node\n",
lock->ml.node == dlm->node_num ? "master" :
"remote");
memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
}
/* Do nothing for lvb put requests - they should be done in
* place when the lock is downconverted - otherwise we risk
* racing gets and puts which could result in old lvb data
* being propagated. We leave the put flag set and clear it
* here. In the future we might want to clear it at the time
* the put is actually done.
*/
}
spin_unlock(&res->spinlock);
/* reset any lvb flags on the lksb */
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
}
void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
fn = lock->ast;
BUG_ON(lock->ml.node != dlm->node_num);
dlm_update_lvb(dlm, res, lock);
(*fn)(lock->astdata);
}
int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
int ret;
struct dlm_lockstatus *lksb;
int lksbflags;
mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
lksbflags = lksb->flags;
dlm_update_lvb(dlm, res, lock);
/* lock request came from another node
* go do the ast over there */
ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
return ret;
}
void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock, int blocked_type)
{
dlm_bastlockfunc_t *fn = lock->bast;
BUG_ON(lock->ml.node != dlm->node_num);
mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
blocked_type);
(*fn)(lock->astdata, blocked_type);
}
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
int ret;
unsigned int locklen;
struct dlm_ctxt *dlm = data;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
struct list_head *head = NULL;
__be64 cookie;
u32 flags;
u8 node;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
return DLM_REJECTED;
}
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
name = past->name;
locklen = past->namelen;
cookie = past->cookie;
flags = be32_to_cpu(past->flags);
node = past->node_idx;
if (locklen > DLM_LOCKID_NAME_MAX) {
ret = DLM_IVBUFLEN;
mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
"handler!\n", locklen);
goto leave;
}
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
flags);
ret = DLM_BADARGS;
goto leave;
}
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
(flags & LKM_GET_LVB ? "get lvb" : "none"));
mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
if (past->type != DLM_AST &&
past->type != DLM_BAST) {
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
"name=%.*s, node=%u\n", past->type,
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
res = dlm_lookup_lockres(dlm, name, locklen);
if (!res) {
mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
"name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
mlog(0, "Responding with DLM_RECOVERING!\n");
ret = DLM_RECOVERING;
goto unlock_out;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
mlog(0, "Responding with DLM_MIGRATING!\n");
ret = DLM_MIGRATING;
goto unlock_out;
}
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
/* if not on convert, try blocked for ast, granted for bast */
if (past->type == DLM_AST)
head = &res->blocked;
else
head = &res->granted;
list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
"node=%u\n", past->type == DLM_AST ? "" : "b",
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_NORMAL;
unlock_out:
spin_unlock(&res->spinlock);
goto leave;
do_ast:
ret = DLM_NORMAL;
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
lock->ml.type, lock->ml.convert_type);
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
} else {
// should already be there....
}
lock->lksb->status = DLM_NORMAL;
/* if we requested the lvb, fetch it into our lksb now */
if (flags & LKM_GET_LVB) {
BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
}
}
spin_unlock(&res->spinlock);
if (past->type == DLM_AST)
dlm_do_local_ast(dlm, res, lock);
else
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
leave:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock, int msg_type,
int blocked_type, int flags)
{
int ret = 0;
struct dlm_proxy_ast past;
struct kvec vec[2];
size_t veclen = 1;
int status;
mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
past.type = msg_type;
past.blocked_type = blocked_type;
past.namelen = res->lockname.len;
memcpy(past.name, res->lockname.name, past.namelen);
past.cookie = lock->ml.cookie;
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
dlm->name, res->lockname.len, res->lockname.name, ret,
lock->ml.node);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
"node is dead!\n", lock->ml.node);
BUG();
} else if (status == DLM_MIGRATING) {
mlog(ML_ERROR, "sent AST to node %u, it returned "
"DLM_MIGRATING!\n", lock->ml.node);
BUG();
} else if (status != DLM_NORMAL && status != DLM_IVLOCKID) {
mlog(ML_ERROR, "AST to node %u returned %d!\n",
lock->ml.node, status);
/* ignore it */
}
ret = 0;
}
return ret;
}

1150
fs/ocfs2/dlm/dlmcommon.h Normal file

File diff suppressed because it is too large Load diff

544
fs/ocfs2/dlm/dlmconvert.c Normal file
View file

@ -0,0 +1,544 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmconvert.c
*
* underlying calls for lock conversion
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
/* NOTE: __dlmconvert_master is the only function in here that
* needs a spinlock held on entry (res->spinlock) and it is the
* only one that holds a lock on exit (res->spinlock).
* All other functions in here need no locks and drop all of
* the locks that they acquire. */
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags,
int type, int *call_ast,
int *kick_thread);
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
/*
* this is only called directly by dlmlock(), and only when the
* local node is the owner of the lockres
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: see __dlmconvert_master
*/
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
int call_ast = 0, kick_thread = 0;
enum dlm_status status;
spin_lock(&res->spinlock);
/* we are not in a network handler, this is fine */
__dlm_wait_on_lockres(res);
__dlm_lockres_reserve_ast(res);
res->state |= DLM_LOCK_RES_IN_PROGRESS;
status = __dlmconvert_master(dlm, res, lock, flags, type,
&call_ast, &kick_thread);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
dlm_error(status);
/* either queue the ast or release it */
if (call_ast)
dlm_queue_ast(dlm, lock);
else
dlm_lockres_release_ast(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
return status;
}
/* performs lock conversion at the lockres master site
* locking:
* caller needs: res->spinlock
* taken: takes and drops lock->spinlock
* held on exit: res->spinlock
* returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
* call_ast: whether ast should be called for this lock
* kick_thread: whether dlm_kick_thread should be called
*/
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags,
int type, int *call_ast,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
lock->ml.type, lock->ml.convert_type, type);
spin_lock(&lock->spinlock);
/* already converting? */
if (lock->ml.convert_type != LKM_IVMODE) {
mlog(ML_ERROR, "attempted to convert a lock with a lock "
"conversion pending\n");
status = DLM_DENIED;
goto unlock_exit;
}
/* must be on grant queue to convert */
if (!dlm_lock_on_list(&res->granted, lock)) {
mlog(ML_ERROR, "attempted to convert a lock not on grant "
"queue\n");
status = DLM_DENIED;
goto unlock_exit;
}
if (flags & LKM_VALBLK) {
switch (lock->ml.type) {
case LKM_EXMODE:
/* EX + LKM_VALBLK + convert == set lvb */
mlog(0, "will set lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
break;
case LKM_PRMODE:
case LKM_NLMODE:
/* refetch if new level is not NL */
if (type > LKM_NLMODE) {
mlog(0, "will fetch new value into "
"lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
lock->lksb->flags |= DLM_LKSB_GET_LVB;
} else {
mlog(0, "will NOT fetch new value "
"into lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
flags &= ~(LKM_VALBLK);
}
break;
}
}
/* in-place downconvert? */
if (type <= lock->ml.type)
goto grant;
/* upconvert from here on */
status = DLM_NORMAL;
list_for_each_entry(tmplock, &res->granted, list) {
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
goto switch_queues;
}
/* fall thru to grant */
grant:
mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
res->lockname.name, dlm_lock_mode_name(type));
/* immediately grant the new lock type */
lock->lksb->status = DLM_NORMAL;
if (lock->ml.node == dlm->node_num)
mlog(0, "doing in-place convert for nonlocal lock\n");
lock->ml.type = type;
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
switch_queues:
if (flags & LKM_NOQUEUE) {
mlog(0, "failed to convert NOQUEUE lock %.*s from "
"%d to %d...\n", res->lockname.len, res->lockname.name,
lock->ml.type, type);
status = DLM_NOTQUEUED;
goto unlock_exit;
}
mlog(0, "res %.*s, queueing...\n", res->lockname.len,
res->lockname.name);
lock->ml.convert_type = type;
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->converting);
unlock_exit:
spin_unlock(&lock->spinlock);
if (status == DLM_DENIED) {
__dlm_print_one_lock_resource(res);
}
if (status == DLM_NORMAL)
*kick_thread = 1;
return status;
}
void dlm_revert_pending_convert(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
/* messages the master site to do lock conversion
* locking:
* caller needs: none
* taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
* held on exit: none
* returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
*/
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
mlog(0, "bailing out early since res is RECOVERING "
"on secondary queue\n");
/* __dlm_print_one_lock_resource(res); */
status = DLM_RECOVERING;
goto bail;
}
/* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
if (lock->ml.convert_type != LKM_IVMODE) {
__dlm_print_one_lock_resource(res);
mlog(ML_ERROR, "converting a remote lock that is already "
"converting! (cookie=%u:%llu, conv=%d)\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ml.convert_type);
status = DLM_DENIED;
goto bail;
}
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->converting);
lock->convert_pending = 1;
lock->ml.convert_type = type;
if (flags & LKM_VALBLK) {
if (lock->ml.type == LKM_EXMODE) {
flags |= LKM_PUT_LVB;
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
} else {
if (lock->ml.convert_type == LKM_NLMODE)
flags &= ~LKM_VALBLK;
else {
flags |= LKM_GET_LVB;
lock->lksb->flags |= DLM_LKSB_GET_LVB;
}
}
}
spin_unlock(&res->spinlock);
/* no locks held here.
* need to wait for a reply as to whether it got queued or not. */
status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
/* if it failed, move it back to granted queue */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
}
bail:
spin_unlock(&res->spinlock);
/* TODO: should this be a wake_one? */
/* wake up any IN_PROGRESS waiters */
wake_up(&res->wq);
return status;
}
/* sends DLM_CONVERT_LOCK_MSG to master site
* locking:
* caller needs: none
* taken: none
* held on exit: none
* returns: DLM_NOLOCKMGR, status from remote node
*/
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
struct dlm_convert_lock convert;
int tmpret;
enum dlm_status ret;
int status = 0;
struct kvec vec[2];
size_t veclen = 1;
mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
memset(&convert, 0, sizeof(struct dlm_convert_lock));
convert.node_idx = dlm->node_num;
convert.requested_type = type;
convert.cookie = lock->ml.cookie;
convert.namelen = res->lockname.len;
convert.flags = cpu_to_be32(flags);
memcpy(convert.name, res->lockname.name, convert.namelen);
vec[0].iov_len = sizeof(struct dlm_convert_lock);
vec[0].iov_base = &convert;
if (flags & LKM_PUT_LVB) {
/* extra data to send if we are updating lvb */
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
vec, veclen, res->owner, &status);
if (tmpret >= 0) {
// successfully sent and received
ret = status; // this is already a dlm_status
if (ret == DLM_RECOVERING) {
mlog(0, "node %u returned DLM_RECOVERING from convert "
"message!\n", res->owner);
} else if (ret == DLM_MIGRATING) {
mlog(0, "node %u returned DLM_MIGRATING from convert "
"message!\n", res->owner);
} else if (ret == DLM_FORWARD) {
mlog(0, "node %u returned DLM_FORWARD from convert "
"message!\n", res->owner);
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
res->owner);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
* to notice the node is dead. times out after 5s. */
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from convert message!\n", res->owner);
} else {
ret = dlm_err_to_dlm_status(tmpret);
}
}
return ret;
}
/* handler for DLM_CONVERT_LOCK_MSG on master site
* locking:
* caller needs: none
* taken: takes and drop res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
* status from __dlmconvert_master
*/
int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
struct dlm_lock *tmp_lock;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
return DLM_REJECTED;
}
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
status = DLM_IVBUFLEN;
dlm_error(status);
goto leave;
}
flags = be32_to_cpu(cnv->flags);
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
mlog(ML_ERROR, "both PUT and GET lvb specified\n");
status = DLM_BADARGS;
goto leave;
}
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
(flags & LKM_GET_LVB ? "get lvb" : "none"));
status = DLM_IVLOCKID;
res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
if (!res) {
dlm_error(status);
goto leave;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
if (status != DLM_NORMAL) {
spin_unlock(&res->spinlock);
dlm_error(status);
goto leave;
}
list_for_each_entry(tmp_lock, &res->granted, list) {
if (tmp_lock->ml.cookie == cnv->cookie &&
tmp_lock->ml.node == cnv->node_idx) {
lock = tmp_lock;
dlm_lock_get(lock);
break;
}
}
spin_unlock(&res->spinlock);
if (!lock) {
status = DLM_IVLOCKID;
mlog(ML_ERROR, "did not find lock to convert on grant queue! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
dlm_print_one_lock_resource(res);
goto leave;
}
/* found the lock */
lksb = lock->lksb;
/* see if caller needed to get/put lvb */
if (flags & LKM_PUT_LVB) {
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
lksb->flags |= DLM_LKSB_PUT_LVB;
memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
} else if (flags & LKM_GET_LVB) {
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
lksb->flags |= DLM_LKSB_GET_LVB;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
if (status == DLM_NORMAL) {
__dlm_lockres_reserve_ast(res);
ast_reserved = 1;
res->state |= DLM_LOCK_RES_IN_PROGRESS;
status = __dlmconvert_master(dlm, res, lock, flags,
cnv->requested_type,
&call_ast, &kick_thread);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
wake = 1;
}
spin_unlock(&res->spinlock);
if (wake)
wake_up(&res->wq);
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
leave:
if (lock)
dlm_lock_put(lock);
/* either queue the ast or release it, if reserved */
if (call_ast)
dlm_queue_ast(dlm, lock);
else if (ast_reserved)
dlm_lockres_release_ast(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}

35
fs/ocfs2/dlm/dlmconvert.h Normal file
View file

@ -0,0 +1,35 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmconvert.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMCONVERT_H
#define DLMCONVERT_H
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
#endif

1000
fs/ocfs2/dlm/dlmdebug.c Normal file

File diff suppressed because it is too large Load diff

81
fs/ocfs2/dlm/dlmdebug.h Normal file
View file

@ -0,0 +1,81 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmdebug.h
*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMDEBUG_H
#define DLMDEBUG_H
void dlm_print_one_mle(struct dlm_master_list_entry *mle);
#ifdef CONFIG_DEBUG_FS
struct dlm_debug_ctxt {
struct kref debug_refcnt;
struct dentry *debug_state_dentry;
struct dentry *debug_lockres_dentry;
struct dentry *debug_mle_dentry;
struct dentry *debug_purgelist_dentry;
};
struct debug_lockres {
int dl_len;
char *dl_buf;
struct dlm_ctxt *dl_ctxt;
struct dlm_lock_resource *dl_res;
};
int dlm_debug_init(struct dlm_ctxt *dlm);
void dlm_debug_shutdown(struct dlm_ctxt *dlm);
int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
int dlm_create_debugfs_root(void);
void dlm_destroy_debugfs_root(void);
#else
static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
return 0;
}
static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_root(void)
{
return 0;
}
static inline void dlm_destroy_debugfs_root(void)
{
}
#endif /* CONFIG_DEBUG_FS */
#endif /* DLMDEBUG_H */

2389
fs/ocfs2/dlm/dlmdomain.c Normal file

File diff suppressed because it is too large Load diff

36
fs/ocfs2/dlm/dlmdomain.h Normal file
View file

@ -0,0 +1,36 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmdomain.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMDOMAIN_H
#define DLMDOMAIN_H
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
int dlm_joined(struct dlm_ctxt *dlm);
int dlm_shutting_down(struct dlm_ctxt *dlm);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
#endif

761
fs/ocfs2/dlm/dlmlock.c Normal file
View file

@ -0,0 +1,761 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmlock.c
*
* underlying calls for lock creation
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags);
static void dlm_init_lock(struct dlm_lock *newlock, int type,
u8 node, u64 cookie);
static void dlm_lock_release(struct kref *kref);
static void dlm_lock_detach_lockres(struct dlm_lock *lock);
int dlm_init_lock_cache(void)
{
dlm_lock_cache = kmem_cache_create("o2dlm_lock",
sizeof(struct dlm_lock),
0, SLAB_HWCACHE_ALIGN, NULL);
if (dlm_lock_cache == NULL)
return -ENOMEM;
return 0;
}
void dlm_destroy_lock_cache(void)
{
if (dlm_lock_cache)
kmem_cache_destroy(dlm_lock_cache);
}
/* Tell us whether we can grant a new lock request.
* locking:
* caller needs: res->spinlock
* taken: none
* held on exit: none
* returns: 1 if the lock can be granted, 0 otherwise.
*/
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
struct dlm_lock *tmplock;
list_for_each_entry(tmplock, &res->granted, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
if (!dlm_lock_compatible(tmplock->ml.convert_type,
lock->ml.type))
return 0;
}
return 1;
}
/* performs lock creation at the lockres master site
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_NOTQUEUED
*/
static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
int call_ast = 0, kick_thread = 0;
enum dlm_status status = DLM_NORMAL;
mlog(0, "type=%d\n", lock->ml.type);
spin_lock(&res->spinlock);
/* if called from dlm_create_lock_handler, need to
* ensure it will not sleep in dlm_wait_on_lockres */
status = __dlm_lockres_state_to_status(res);
if (status != DLM_NORMAL &&
lock->ml.node != dlm->node_num) {
/* erf. state changed after lock was dropped. */
spin_unlock(&res->spinlock);
dlm_error(status);
return status;
}
__dlm_wait_on_lockres(res);
__dlm_lockres_reserve_ast(res);
if (dlm_can_grant_new_lock(res, lock)) {
mlog(0, "I can grant this lock right away\n");
/* got it right away */
lock->lksb->status = DLM_NORMAL;
status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->granted);
/* for the recovery lock, we can't allow the ast
* to be queued since the dlmthread is already
* frozen. but the recovery lock is always locked
* with LKM_NOQUEUE so we do not need the ast in
* this special case */
if (!dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
kick_thread = 1;
call_ast = 1;
} else {
mlog(0, "%s: returning DLM_NORMAL to "
"node %u for reco lock\n", dlm->name,
lock->ml.node);
}
} else {
/* for NOQUEUE request, unless we get the
* lock right away, return DLM_NOTQUEUED */
if (flags & LKM_NOQUEUE) {
status = DLM_NOTQUEUED;
if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
mlog(0, "%s: returning NOTQUEUED to "
"node %u for reco lock\n", dlm->name,
lock->ml.node);
}
} else {
status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
}
}
spin_unlock(&res->spinlock);
wake_up(&res->wq);
/* either queue the ast or release it */
if (call_ast)
dlm_queue_ast(dlm, lock);
else
dlm_lockres_release_ast(dlm, res);
dlm_lockres_calc_usage(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
return status;
}
void dlm_revert_pending_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* remove from local queue if it failed */
list_del_init(&lock->list);
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
}
/*
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_DENIED, DLM_RECOVERING, or net status
*/
static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
enum dlm_status status = DLM_DENIED;
int lockres_changed = 1;
mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
lock->ml.type, res->lockname.len,
res->lockname.name, flags);
/*
* Wait if resource is getting recovered, remastered, etc.
* If the resource was remastered and new owner is self, then exit.
*/
spin_lock(&res->spinlock);
__dlm_wait_on_lockres(res);
if (res->owner == dlm->node_num) {
spin_unlock(&res->spinlock);
return DLM_RECOVERING;
}
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
lock->lock_pending = 1;
spin_unlock(&res->spinlock);
/* spec seems to say that you will get DLM_NORMAL when the lock
* has been queued, meaning we need to wait for a reply here. */
status = dlm_send_remote_lock_request(dlm, res, lock, flags);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->lock_pending = 0;
if (status != DLM_NORMAL) {
if (status == DLM_RECOVERING &&
dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* recovery lock was mastered by dead node.
* we need to have calc_usage shoot down this
* lockres and completely remaster it. */
mlog(0, "%s: recovery lock was owned by "
"dead node %u, remaster it now.\n",
dlm->name, res->owner);
} else if (status != DLM_NOTQUEUED) {
/*
* DO NOT call calc_usage, as this would unhash
* the remote lockres before we ever get to use
* it. treat as if we never made any change to
* the lockres.
*/
lockres_changed = 0;
dlm_error(status);
}
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
} else if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* special case for the $RECOVERY lock.
* there will never be an AST delivered to put
* this lock on the proper secondary queue
* (granted), so do it manually. */
mlog(0, "%s: $RECOVERY lock for this node (%u) is "
"mastered by %u; got lock, manually granting (no ast)\n",
dlm->name, dlm->node_num, res->owner);
list_move_tail(&lock->list, &res->granted);
}
spin_unlock(&res->spinlock);
if (lockres_changed)
dlm_lockres_calc_usage(dlm, res);
wake_up(&res->wq);
return status;
}
/* for remote lock creation.
* locking:
* caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
* taken: none
* held on exit: none
* returns: DLM_NOLOCKMGR, or net status
*/
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
struct dlm_create_lock create;
int tmpret, status = 0;
enum dlm_status ret;
memset(&create, 0, sizeof(create));
create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
create.cookie = lock->ml.cookie;
create.namelen = res->lockname.len;
create.flags = cpu_to_be32(flags);
memcpy(create.name, res->lockname.name, create.namelen);
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
ret = status;
if (ret == DLM_REJECTED) {
mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
"owned by node %u. That node is coming back up "
"currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
"node %u\n", dlm->name, create.namelen, create.name,
tmpret, res->owner);
if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
else
ret = dlm_err_to_dlm_status(tmpret);
}
return ret;
}
void dlm_lock_get(struct dlm_lock *lock)
{
kref_get(&lock->lock_refs);
}
void dlm_lock_put(struct dlm_lock *lock)
{
kref_put(&lock->lock_refs, dlm_lock_release);
}
static void dlm_lock_release(struct kref *kref)
{
struct dlm_lock *lock;
lock = container_of(kref, struct dlm_lock, lock_refs);
BUG_ON(!list_empty(&lock->list));
BUG_ON(!list_empty(&lock->ast_list));
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
dlm_lock_detach_lockres(lock);
if (lock->lksb_kernel_allocated) {
mlog(0, "freeing kernel-allocated lksb\n");
kfree(lock->lksb);
}
kmem_cache_free(dlm_lock_cache, lock);
}
/* associate a lock with it's lockres, getting a ref on the lockres */
void dlm_lock_attach_lockres(struct dlm_lock *lock,
struct dlm_lock_resource *res)
{
dlm_lockres_get(res);
lock->lockres = res;
}
/* drop ref on lockres, if there is still one associated with lock */
static void dlm_lock_detach_lockres(struct dlm_lock *lock)
{
struct dlm_lock_resource *res;
res = lock->lockres;
if (res) {
lock->lockres = NULL;
mlog(0, "removing lock's lockres reference\n");
dlm_lockres_put(res);
}
}
static void dlm_init_lock(struct dlm_lock *newlock, int type,
u8 node, u64 cookie)
{
INIT_LIST_HEAD(&newlock->list);
INIT_LIST_HEAD(&newlock->ast_list);
INIT_LIST_HEAD(&newlock->bast_list);
spin_lock_init(&newlock->spinlock);
newlock->ml.type = type;
newlock->ml.convert_type = LKM_IVMODE;
newlock->ml.highest_blocked = LKM_IVMODE;
newlock->ml.node = node;
newlock->ml.pad1 = 0;
newlock->ml.list = 0;
newlock->ml.flags = 0;
newlock->ast = NULL;
newlock->bast = NULL;
newlock->astdata = NULL;
newlock->ml.cookie = cpu_to_be64(cookie);
newlock->ast_pending = 0;
newlock->bast_pending = 0;
newlock->convert_pending = 0;
newlock->lock_pending = 0;
newlock->unlock_pending = 0;
newlock->cancel_pending = 0;
newlock->lksb_kernel_allocated = 0;
kref_init(&newlock->lock_refs);
}
struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lockstatus *lksb)
{
struct dlm_lock *lock;
int kernel_allocated = 0;
lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
if (!lksb) {
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
}
dlm_init_lock(lock, type, node, cookie);
if (kernel_allocated)
lock->lksb_kernel_allocated = 1;
lock->lksb = lksb;
lksb->lockid = lock;
return lock;
}
/* handler for lock creation net message
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
*/
int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
enum dlm_status status = DLM_NORMAL;
char *name;
unsigned int namelen;
BUG_ON(!dlm);
if (!dlm_grab(dlm))
return DLM_REJECTED;
name = create->name;
namelen = create->namelen;
status = DLM_REJECTED;
if (!dlm_domain_fully_joined(dlm)) {
mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
"sending a create_lock message for lock %.*s!\n",
dlm->name, create->node_idx, namelen, name);
dlm_error(status);
goto leave;
}
status = DLM_IVBUFLEN;
if (namelen > DLM_LOCKID_NAME_MAX) {
dlm_error(status);
goto leave;
}
status = DLM_SYSERR;
newlock = dlm_new_lock(create->requested_type,
create->node_idx,
be64_to_cpu(create->cookie), NULL);
if (!newlock) {
dlm_error(status);
goto leave;
}
lksb = newlock->lksb;
if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
lksb->flags |= DLM_LKSB_GET_LVB;
mlog(0, "set DLM_LKSB_GET_LVB flag\n");
}
status = DLM_IVLOCKID;
res = dlm_lookup_lockres(dlm, name, namelen);
if (!res) {
dlm_error(status);
goto leave;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
spin_unlock(&res->spinlock);
if (status != DLM_NORMAL) {
mlog(0, "lockres recovering/migrating/in-progress\n");
goto leave;
}
dlm_lock_attach_lockres(newlock, res);
status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
leave:
if (status != DLM_NORMAL)
if (newlock)
dlm_lock_put(newlock);
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}
/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
{
u64 tmpnode = node_num;
/* shift single byte of node num into top 8 bits */
tmpnode <<= 56;
spin_lock(&dlm_cookie_lock);
*cookie = (dlm_next_cookie | tmpnode);
if (++dlm_next_cookie & 0xff00000000000000ull) {
mlog(0, "This node's cookie will now wrap!\n");
dlm_next_cookie = 1;
}
spin_unlock(&dlm_cookie_lock);
}
enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
struct dlm_lockstatus *lksb, int flags,
const char *name, int namelen, dlm_astlockfunc_t *ast,
void *data, dlm_bastlockfunc_t *bast)
{
enum dlm_status status;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
int convert = 0, recovery = 0;
/* yes this function is a mess.
* TODO: clean this up. lots of common code in the
* lock and convert paths, especially in the retry blocks */
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
}
status = DLM_BADPARAM;
if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
dlm_error(status);
goto error;
}
if (flags & ~LKM_VALID_FLAGS) {
dlm_error(status);
goto error;
}
convert = (flags & LKM_CONVERT);
recovery = (flags & LKM_RECOVERY);
if (recovery &&
(!dlm_is_recovery_lock(name, namelen) || convert) ) {
dlm_error(status);
goto error;
}
if (convert && (flags & LKM_LOCAL)) {
mlog(ML_ERROR, "strange LOCAL convert request!\n");
goto error;
}
if (convert) {
/* CONVERT request */
/* if converting, must pass in a valid dlm_lock */
lock = lksb->lockid;
if (!lock) {
mlog(ML_ERROR, "NULL lock pointer in convert "
"request\n");
goto error;
}
res = lock->lockres;
if (!res) {
mlog(ML_ERROR, "NULL lockres pointer in convert "
"request\n");
goto error;
}
dlm_lockres_get(res);
/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
* static after the original lock call. convert requests will
* ensure that everything is the same, or return DLM_BADARGS.
* this means that DLM_DENIED_NOASTS will never be returned.
*/
if (lock->lksb != lksb || lock->ast != ast ||
lock->bast != bast || lock->astdata != data) {
status = DLM_BADARGS;
mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
"astdata=%p\n", lksb, ast, bast, data);
mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
"astdata=%p\n", lock->lksb, lock->ast,
lock->bast, lock->astdata);
goto error;
}
retry_convert:
dlm_wait_for_recovery(dlm);
if (res->owner == dlm->node_num)
status = dlmconvert_master(dlm, res, lock, flags, mode);
else
status = dlmconvert_remote(dlm, res, lock, flags, mode);
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
/* for now, see how this works without sleeping
* and just retry right away. I suspect the reco
* or migration will complete fast enough that
* no waiting will be necessary */
mlog(0, "retrying convert with migration/recovery/"
"in-progress\n");
msleep(100);
goto retry_convert;
}
} else {
u64 tmpcookie;
/* LOCK request */
status = DLM_BADARGS;
if (!name) {
dlm_error(status);
goto error;
}
status = DLM_IVBUFLEN;
if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) {
dlm_error(status);
goto error;
}
dlm_get_next_cookie(dlm->node_num, &tmpcookie);
lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
if (!lock) {
dlm_error(status);
goto error;
}
if (!recovery)
dlm_wait_for_recovery(dlm);
/* find or create the lock resource */
res = dlm_get_lock_resource(dlm, name, namelen, flags);
if (!res) {
status = DLM_IVLOCKID;
dlm_error(status);
goto error;
}
mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
dlm_lock_attach_lockres(lock, res);
lock->ast = ast;
lock->bast = bast;
lock->astdata = data;
retry_lock:
if (flags & LKM_VALBLK) {
mlog(0, "LKM_VALBLK passed by caller\n");
/* LVB requests for non PR, PW or EX locks are
* ignored. */
if (mode < LKM_PRMODE)
flags &= ~LKM_VALBLK;
else {
flags |= LKM_GET_LVB;
lock->lksb->flags |= DLM_LKSB_GET_LVB;
}
}
if (res->owner == dlm->node_num)
status = dlmlock_master(dlm, res, lock, flags);
else
status = dlmlock_remote(dlm, res, lock, flags);
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
msleep(100);
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
} else {
dlm_wait_for_recovery(dlm);
goto retry_lock;
}
}
/* Inflight taken in dlm_get_lock_resource() is dropped here */
spin_lock(&res->spinlock);
dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
dlm_lockres_calc_usage(dlm, res);
dlm_kick_thread(dlm, res);
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
dlm_error(status);
goto error;
}
}
error:
if (status != DLM_NORMAL) {
if (lock && !convert)
dlm_lock_put(lock);
// this is kind of unnecessary
lksb->status = status;
}
/* put lockres ref from the convert path
* or from dlm_get_lock_resource */
if (res)
dlm_lockres_put(res);
return status;
}
EXPORT_SYMBOL_GPL(dlmlock);

3468
fs/ocfs2/dlm/dlmmaster.c Normal file

File diff suppressed because it is too large Load diff

2923
fs/ocfs2/dlm/dlmrecovery.c Normal file

File diff suppressed because it is too large Load diff

756
fs/ocfs2/dlm/dlmthread.c Normal file
View file

@ -0,0 +1,756 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmthread.c
*
* standalone DLM module
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
#include "cluster/masklog.h"
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
/* will exit holding res->spinlock, but may drop in function */
/* waits until flags are cleared on res->state */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
{
DECLARE_WAITQUEUE(wait, current);
assert_spin_locked(&res->spinlock);
add_wait_queue(&res->wq, &wait);
repeat:
set_current_state(TASK_UNINTERRUPTIBLE);
if (res->state & flags) {
spin_unlock(&res->spinlock);
schedule();
spin_lock(&res->spinlock);
goto repeat;
}
remove_wait_queue(&res->wq, &wait);
__set_current_state(TASK_RUNNING);
}
int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
{
if (list_empty(&res->granted) &&
list_empty(&res->converting) &&
list_empty(&res->blocked))
return 0;
return 1;
}
/* "unused": the lockres has no locks, is not on the dirty list,
* has no inflight locks (in the gap between mastery and acquiring
* the first lock), and has no bits in its refmap.
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
int bit;
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_has_locks(res))
return 0;
/* Locks are in the process of being created */
if (res->inflight_locks)
return 0;
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
if (res->state & DLM_LOCK_RES_RECOVERING)
return 0;
/* Another node has this resource with this node as the master */
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES)
return 0;
return 1;
}
/* Call whenever you may have added or deleted something from one of
* the lockres queue's. This will figure out whether it belongs on the
* unused list or not and does the appropriate thing. */
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
if (list_empty(&res->purge)) {
mlog(0, "%s: Adding res %.*s to purge list\n",
dlm->name, res->lockname.len, res->lockname.name);
res->last_used = jiffies;
dlm_lockres_get(res);
list_add_tail(&res->purge, &dlm->purge_list);
dlm->purge_count++;
}
} else if (!list_empty(&res->purge)) {
mlog(0, "%s: Removing res %.*s from purge list\n",
dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
}
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
__dlm_lockres_calc_usage(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
}
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
res->lockname.len, res->lockname.name, master);
if (!master) {
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
/* This ensures that clear refmap is sent after the set */
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
spin_unlock(&res->spinlock);
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
if (!dlm_is_host_down(ret))
BUG();
}
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
}
if (!list_empty(&res->purge)) {
mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
dlm->name, res->lockname.len, res->lockname.name, master);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
__dlm_print_one_lock_resource(res);
BUG();
}
__dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
} else
spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
int purge_now)
{
unsigned int run_max, unused;
unsigned long purge_jiffies;
struct dlm_lock_resource *lockres;
spin_lock(&dlm->spinlock);
run_max = dlm->purge_count;
while(run_max && !list_empty(&dlm->purge_list)) {
run_max--;
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
spin_lock(&lockres->spinlock);
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
/* Make sure that we want to be processing this guy at
* this time. */
if (!purge_now && time_after(purge_jiffies, jiffies)) {
/* Since resources are added to the purge list
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
spin_unlock(&lockres->spinlock);
break;
}
/* Status of the lockres *might* change so double
* check. If the lockres is unused, holding the dlm
* spinlock will prevent people from getting and more
* refs on it. */
unused = __dlm_lockres_unused(lockres);
if (!unused ||
(lockres->state & DLM_LOCK_RES_MIGRATING) ||
(lockres->inflight_assert_workers != 0)) {
mlog(0, "%s: res %.*s is in use or being remastered, "
"used %d, state %d, assert master workers %u\n",
dlm->name, lockres->lockname.len,
lockres->lockname.name,
!unused, lockres->state,
lockres->inflight_assert_workers);
list_move_tail(&lockres->purge, &dlm->purge_list);
spin_unlock(&lockres->spinlock);
continue;
}
dlm_lockres_get(lockres);
dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
/* Avoid adding any scheduling latencies */
cond_resched_lock(&dlm->spinlock);
}
spin_unlock(&dlm->spinlock);
}
static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
int can_grant = 1;
/*
* Because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
* basts right before queueing them all throughout
*/
assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_IN_PROGRESS)));
converting:
if (list_empty(&res->converting))
goto blocked;
mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
res->lockname.len, res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
target->ml.convert_type)) {
can_grant = 0;
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
__dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
target->ml.convert_type;
}
}
list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
target->ml.convert_type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
__dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
target->ml.convert_type;
}
}
/* we can convert the lock */
if (can_grant) {
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
"%d => %d, node %u\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
target->ml.convert_type = LKM_IVMODE;
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
__dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
blocked:
if (list_empty(&res->blocked))
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
__dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
__dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
/* we can grant the blocked lock (only
* possible if converting list empty) */
if (can_grant) {
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
"node %u\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type, target->ml.node);
/* target->ml.type is already correct */
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
__dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
leave:
return;
}
/* must have NO locks when calling this with res !=NULL * */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
__dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
}
wake_up(&dlm->dlm_thread_wq);
}
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
if ((res->owner == dlm->node_num)) {
if (res->state & (DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_BLOCK_DIRTY))
return;
if (list_empty(&res->dirty)) {
/* ref for dirty_list */
dlm_lockres_get(res);
list_add_tail(&res->dirty, &dlm->dirty_list);
res->state |= DLM_LOCK_RES_DIRTY;
}
}
mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
res->lockname.name);
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "Starting dlm_thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
return -EINVAL;
}
return 0;
}
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
}
static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
{
int empty;
spin_lock(&dlm->spinlock);
empty = list_empty(&dlm->dirty_list);
spin_unlock(&dlm->spinlock);
return empty;
}
static void dlm_flush_asts(struct dlm_ctxt *dlm)
{
int ret;
struct dlm_lock *lock;
struct dlm_lock_resource *res;
u8 hi;
spin_lock(&dlm->ast_lock);
while (!list_empty(&dlm->pending_asts)) {
lock = list_entry(dlm->pending_asts.next,
struct dlm_lock, ast_list);
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
"node %u\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ml.type, lock->ml.node);
BUG_ON(!lock->ast_pending);
/* remove from list (including ref) */
list_del_init(&lock->ast_list);
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
if (lock->ml.node != dlm->node_num) {
ret = dlm_do_remote_ast(dlm, res, lock);
if (ret < 0)
mlog_errno(ret);
} else
dlm_do_local_ast(dlm, res, lock);
spin_lock(&dlm->ast_lock);
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
mlog(0, "%s: res %.*s, AST queued while flushing last "
"one\n", dlm->name, res->lockname.len,
res->lockname.name);
} else
lock->ast_pending = 0;
/* drop the extra ref.
* this may drop it completely. */
dlm_lock_put(lock);
dlm_lockres_release_ast(dlm, res);
}
while (!list_empty(&dlm->pending_basts)) {
lock = list_entry(dlm->pending_basts.next,
struct dlm_lock, bast_list);
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
BUG_ON(!lock->bast_pending);
/* get the highest blocked lock, and reset */
spin_lock(&lock->spinlock);
BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
hi = lock->ml.highest_blocked;
lock->ml.highest_blocked = LKM_IVMODE;
spin_unlock(&lock->spinlock);
/* remove from list (including ref) */
list_del_init(&lock->bast_list);
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
"blocked %d, node %u\n",
dlm->name, res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
hi, lock->ml.node);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
if (ret < 0)
mlog_errno(ret);
} else
dlm_do_local_bast(dlm, res, lock, hi);
spin_lock(&dlm->ast_lock);
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
mlog(0, "%s: res %.*s, BAST queued while flushing last "
"one\n", dlm->name, res->lockname.len,
res->lockname.name);
} else
lock->bast_pending = 0;
/* drop the extra ref.
* this may drop it completely. */
dlm_lock_put(lock);
dlm_lockres_release_ast(dlm, res);
}
wake_up(&dlm->ast_wq);
spin_unlock(&dlm->ast_lock);
}
#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
#define DLM_THREAD_MAX_DIRTY 100
#define DLM_THREAD_MAX_ASTS 10
static int dlm_thread(void *data)
{
struct dlm_lock_resource *res;
struct dlm_ctxt *dlm = data;
unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
mlog(0, "dlm thread running for %s...\n", dlm->name);
while (!kthread_should_stop()) {
int n = DLM_THREAD_MAX_DIRTY;
/* dlm_shutting_down is very point-in-time, but that
* doesn't matter as we'll just loop back around if we
* get false on the leading edge of a state
* transition. */
dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
/* We really don't want to hold dlm->spinlock while
* calling dlm_shuffle_lists on each lockres that
* needs to have its queues adjusted and AST/BASTs
* run. So let's pull each entry off the dirty_list
* and drop dlm->spinlock ASAP. Once off the list,
* res->spinlock needs to be taken again to protect
* the queues while calling dlm_shuffle_lists. */
spin_lock(&dlm->spinlock);
while (!list_empty(&dlm->dirty_list)) {
int delay = 0;
res = list_entry(dlm->dirty_list.next,
struct dlm_lock_resource, dirty);
/* peel a lockres off, remove it from the list,
* unset the dirty flag and drop the dlm lock */
BUG_ON(!res);
dlm_lockres_get(res);
spin_lock(&res->spinlock);
/* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
list_del_init(&res->dirty);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
/* Drop dirty_list ref */
dlm_lockres_put(res);
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
spin_lock(&dlm->ast_lock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
" dirty %d\n", dlm->name,
!!(res->state & DLM_LOCK_RES_IN_PROGRESS),
!!(res->state & DLM_LOCK_RES_MIGRATING),
!!(res->state & DLM_LOCK_RES_RECOVERING),
!!(res->state & DLM_LOCK_RES_DIRTY));
}
BUG_ON(res->owner != dlm->node_num);
/* it is now ok to move lockreses in these states
* to the dirty list, assuming that they will only be
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
DLM_LOCK_RES_RECOVERING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
spin_unlock(&dlm->ast_lock);
mlog(0, "%s: res %.*s, inprogress, delay list "
"shuffle, state %d\n", dlm->name,
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
goto in_progress;
}
/* at this point the lockres is not migrating/
* recovering/in-progress. we have the lockres
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
spin_unlock(&dlm->ast_lock);
dlm_lockres_calc_usage(dlm, res);
in_progress:
spin_lock(&dlm->spinlock);
/* if the lock was in-progress, stick
* it on the back of the list */
if (delay) {
spin_lock(&res->spinlock);
__dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
}
dlm_lockres_put(res);
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
mlog(0, "%s: Throttling dlm thread\n",
dlm->name);
break;
}
}
spin_unlock(&dlm->spinlock);
dlm_flush_asts(dlm);
/* yield and continue right away if there is more work to do */
if (!n) {
cond_resched();
continue;
}
wait_event_interruptible_timeout(dlm->dlm_thread_wq,
!dlm_dirty_list_empty(dlm) ||
kthread_should_stop(),
timeout);
}
mlog(0, "quitting DLM thread\n");
return 0;
}

698
fs/ocfs2/dlm/dlmunlock.c Normal file
View file

@ -0,0 +1,698 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmunlock.c
*
* underlying calls for unlocking locks
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
#define DLM_UNLOCK_FREE_LOCK 0x00000001
#define DLM_UNLOCK_CALL_AST 0x00000002
#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions);
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions);
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
u8 owner);
/*
* according to the spec:
* http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
*
* flags & LKM_CANCEL != 0: must be converting or blocked
* flags & LKM_CANCEL == 0: must be granted
*
* So to unlock a converting lock, you must first cancel the
* convert (passing LKM_CANCEL in flags), then call the unlock
* again (with no LKM_CANCEL in flags).
*/
/*
* locking:
* caller needs: none
* taken: res->spinlock and lock->spinlock taken and dropped
* held on exit: none
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
* all callers should have taken an extra ref on lock coming in
*/
static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags, int *call_ast,
int master_node)
{
enum dlm_status status;
int actions = 0;
int in_use;
u8 owner;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
if (master_node)
BUG_ON(res->owner != dlm->node_num);
else
BUG_ON(res->owner == dlm->node_num);
spin_lock(&dlm->ast_lock);
/* We want to be sure that we're not freeing a lock
* that still has AST's pending... */
in_use = !list_empty(&lock->ast_list);
spin_unlock(&dlm->ast_lock);
if (in_use && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
"while waiting for an ast!", res->lockname.len,
res->lockname.name);
return DLM_BADPARAM;
}
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
if (master_node && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres in progress!\n");
spin_unlock(&res->spinlock);
return DLM_FORWARD;
}
/* ok for this to sleep if not in a network handler */
__dlm_wait_on_lockres(res);
res->state |= DLM_LOCK_RES_IN_PROGRESS;
}
spin_lock(&lock->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
status = DLM_RECOVERING;
goto leave;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
status = DLM_MIGRATING;
goto leave;
}
/* see above for what the spec says about
* LKM_CANCEL and the lock queue state */
if (flags & LKM_CANCEL)
status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
else
status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node))
goto leave;
/* By now this has been masked out of cancel requests. */
if (flags & LKM_VALBLK) {
/* make the final update to the lvb */
if (master_node)
memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
else
flags |= LKM_PUT_LVB; /* let the send function
* handle it. */
}
if (!master_node) {
owner = res->owner;
/* drop locks and send message */
if (flags & LKM_CANCEL)
lock->cancel_pending = 1;
else
lock->unlock_pending = 1;
spin_unlock(&lock->spinlock);
spin_unlock(&res->spinlock);
status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
flags, owner);
spin_lock(&res->spinlock);
spin_lock(&lock->spinlock);
/* if the master told us the lock was already granted,
* let the ast handle all of these actions */
if (status == DLM_CANCELGRANT) {
actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
DLM_UNLOCK_REGRANT_LOCK|
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
status == DLM_FORWARD ||
status == DLM_NOLOCKMGR
) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
mlog(0, "%s:%.*s: clearing actions, %s\n",
dlm->name, res->lockname.len,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
(status == DLM_FORWARD ? "forward" :
"nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
else
lock->unlock_pending = 0;
}
/* get an extra ref on lock. if we are just switching
* lists here, we dont want the lock to go away. */
dlm_lock_get(lock);
if (actions & DLM_UNLOCK_REMOVE_LOCK) {
list_del_init(&lock->list);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_REGRANT_LOCK) {
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->granted);
}
if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
mlog(0, "clearing convert_type at %smaster node\n",
master_node ? "" : "non-");
lock->ml.convert_type = LKM_IVMODE;
}
/* remove the extra ref on lock */
dlm_lock_put(lock);
leave:
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
if (!dlm_lock_on_list(&res->converting, lock))
BUG_ON(lock->ml.convert_type != LKM_IVMODE);
else
BUG_ON(lock->ml.convert_type == LKM_IVMODE);
spin_unlock(&lock->spinlock);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
atomic_read(&lock->lock_refs.refcount)-1);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_CALL_AST)
*call_ast = 1;
/* if cancel or unlock succeeded, lvb work is done */
if (status == DLM_NORMAL)
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
return status;
}
void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* leave DLM_LKSB_PUT_LVB on the lksb so any final
* update of the lvb will be sent to the new master */
list_del_init(&lock->list);
}
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
}
static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
int *call_ast)
{
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
}
static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags, int *call_ast)
{
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
}
/*
* locking:
* caller needs: none
* taken: none
* held on exit: none
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
*/
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
u8 owner)
{
struct dlm_unlock_lock unlock;
int tmpret;
enum dlm_status ret;
int status = 0;
struct kvec vec[2];
size_t veclen = 1;
mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
if (owner == dlm->node_num) {
/* ended up trying to contact ourself. this means
* that the lockres had been remote but became local
* via a migration. just retry it, now as local */
mlog(0, "%s:%.*s: this node became the master due to a "
"migration, re-evaluate now\n", dlm->name,
res->lockname.len, res->lockname.name);
return DLM_FORWARD;
}
memset(&unlock, 0, sizeof(unlock));
unlock.node_idx = dlm->node_num;
unlock.flags = cpu_to_be32(flags);
unlock.cookie = lock->ml.cookie;
unlock.namelen = res->lockname.len;
memcpy(unlock.name, res->lockname.name, unlock.namelen);
vec[0].iov_len = sizeof(struct dlm_unlock_lock);
vec[0].iov_base = &unlock;
if (flags & LKM_PUT_LVB) {
/* extra data to send if we are updating lvb */
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
vec, veclen, owner, &status);
if (tmpret >= 0) {
// successfully sent and received
if (status == DLM_FORWARD)
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
* unlock, the recovery code completes the operation
* as if the master had not died, then passes the
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
if (dlm_is_node_dead(dlm, owner))
ret = DLM_NORMAL;
else
ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
}
}
return ret;
}
/*
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
* return value from dlmunlock_master
*/
int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
enum dlm_status status = DLM_NORMAL;
int found = 0, i;
struct dlm_lockstatus *lksb = NULL;
int ignore;
u32 flags;
struct list_head *queue;
flags = be32_to_cpu(unlock->flags);
if (flags & LKM_GET_LVB) {
mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
return DLM_BADARGS;
}
if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
"request!\n");
return DLM_BADARGS;
}
if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
return DLM_IVBUFLEN;
}
if (!dlm_grab(dlm))
return DLM_REJECTED;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
if (!res) {
/* We assume here that a no lock resource simply means
* it was migrated away and destroyed before the other
* node could detect it. */
mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
status = DLM_FORWARD;
goto not_found;
}
queue=&res->granted;
found = 0;
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_RECOVERING\n");
status = DLM_RECOVERING;
goto leave;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_MIGRATING\n");
status = DLM_MIGRATING;
goto leave;
}
if (res->owner != dlm->node_num) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_FORWARD -- not master\n");
status = DLM_FORWARD;
goto leave;
}
for (i=0; i<3; i++) {
list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == unlock->cookie &&
lock->ml.node == unlock->node_idx) {
dlm_lock_get(lock);
found = 1;
break;
}
}
if (found)
break;
/* scan granted -> converting -> blocked queues */
queue++;
}
spin_unlock(&res->spinlock);
if (!found) {
status = DLM_IVLOCKID;
goto not_found;
}
/* lock was found on queue */
lksb = lock->lksb;
if (flags & (LKM_VALBLK|LKM_PUT_LVB) &&
lock->ml.type != LKM_EXMODE)
flags &= ~(LKM_VALBLK|LKM_PUT_LVB);
/* unlockast only called on originating node */
if (flags & LKM_PUT_LVB) {
lksb->flags |= DLM_LKSB_PUT_LVB;
memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
}
/* if this is in-progress, propagate the DLM_FORWARD
* all the way back out */
status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
if (status == DLM_FORWARD)
mlog(0, "lockres is in progress\n");
if (flags & LKM_PUT_LVB)
lksb->flags &= ~DLM_LKSB_PUT_LVB;
dlm_lockres_calc_usage(dlm, res);
dlm_kick_thread(dlm, res);
not_found:
if (!found)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
else
dlm_lock_put(lock);
leave:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions)
{
enum dlm_status status;
if (dlm_lock_on_list(&res->blocked, lock)) {
/* cancel this outright */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK);
} else if (dlm_lock_on_list(&res->converting, lock)) {
/* cancel the request, put back on granted */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK |
DLM_UNLOCK_REGRANT_LOCK |
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (dlm_lock_on_list(&res->granted, lock)) {
/* too late, already granted. */
status = DLM_CANCELGRANT;
*actions = DLM_UNLOCK_CALL_AST;
} else {
mlog(ML_ERROR, "lock to cancel is not on any list!\n");
status = DLM_IVLOCKID;
*actions = 0;
}
return status;
}
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions)
{
enum dlm_status status;
/* unlock request */
if (!dlm_lock_on_list(&res->granted, lock)) {
status = DLM_DENIED;
dlm_error(status);
*actions = 0;
} else {
/* unlock granted lock */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_FREE_LOCK |
DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK);
}
return status;
}
/* there seems to be no point in doing this async
* since (even for the remote case) there is really
* no work to queue up... so just do it and fire the
* unlockast by hand when done... */
enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
int flags, dlm_astunlockfunc_t *unlockast, void *data)
{
enum dlm_status status;
struct dlm_lock_resource *res;
struct dlm_lock *lock = NULL;
int call_ast, is_master;
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
}
if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
dlm_error(DLM_BADPARAM);
return DLM_BADPARAM;
}
if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
flags &= ~LKM_VALBLK;
}
if (!lksb->lockid || !lksb->lockid->lockres) {
dlm_error(DLM_BADPARAM);
return DLM_BADPARAM;
}
lock = lksb->lockid;
BUG_ON(!lock);
dlm_lock_get(lock);
res = lock->lockres;
BUG_ON(!res);
dlm_lockres_get(res);
retry:
call_ast = 0;
/* need to retry up here because owner may have changed */
mlog(0, "lock=%p res=%p\n", lock, res);
spin_lock(&res->spinlock);
is_master = (res->owner == dlm->node_num);
if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE)
flags &= ~LKM_VALBLK;
spin_unlock(&res->spinlock);
if (is_master) {
status = dlmunlock_master(dlm, res, lock, lksb, flags,
&call_ast);
mlog(0, "done calling dlmunlock_master: returned %d, "
"call_ast is %d\n", status, call_ast);
} else {
status = dlmunlock_remote(dlm, res, lock, lksb, flags,
&call_ast);
mlog(0, "done calling dlmunlock_remote: returned %d, "
"call_ast is %d\n", status, call_ast);
}
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
status == DLM_FORWARD ||
status == DLM_NOLOCKMGR) {
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
* may be happening on another node. Perhaps the
* proper solution is to queue up requests on the
* other end? */
/* do we want to yield(); ?? */
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
"migration/in-progress/reconnect\n");
goto retry;
}
if (call_ast) {
mlog(0, "calling unlockast(%p, %d)\n", data, status);
if (is_master) {
/* it is possible that there is one last bast
* pending. make sure it is flushed, then
* call the unlockast.
* not an issue if this is a mastered remotely,
* since this lock has been removed from the
* lockres queues and cannot be found. */
dlm_kick_thread(dlm, NULL);
wait_event(dlm->ast_wq,
dlm_lock_basts_flushed(dlm, lock));
}
(*unlockast)(data, status);
}
if (status == DLM_CANCELGRANT)
status = DLM_NORMAL;
if (status == DLM_NORMAL) {
mlog(0, "kicking the thread\n");
dlm_kick_thread(dlm, res);
} else
dlm_error(status);
dlm_lockres_calc_usage(dlm, res);
dlm_lockres_put(res);
dlm_lock_put(lock);
mlog(0, "returning status=%d!\n", status);
return status;
}
EXPORT_SYMBOL_GPL(dlmunlock);

5
fs/ocfs2/dlmfs/Makefile Normal file
View file

@ -0,0 +1,5 @@
ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
ocfs2_dlmfs-objs := userdlm.o dlmfs.o

704
fs/ocfs2/dlmfs/dlmfs.c Normal file
View file

@ -0,0 +1,704 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmfs.c
*
* Code which implements the kernel side of a minimal userspace
* interface to our DLM. This file handles the virtual file system
* used for communication with userspace. Credit should go to ramfs,
* which was a template for the fs side of this module.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
/* Simple VFS hooks based on: */
/*
* Resizable simple ram filesystem for Linux.
*
* Copyright (C) 2000 Linus Torvalds.
* 2000 Transmeta Corp.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/poll.h>
#include <asm/uaccess.h>
#include "stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
static const struct super_operations dlmfs_ops;
static const struct file_operations dlmfs_file_operations;
static const struct inode_operations dlmfs_dir_inode_operations;
static const struct inode_operations dlmfs_root_inode_operations;
static const struct inode_operations dlmfs_file_inode_operations;
static struct kmem_cache *dlmfs_inode_cache;
struct workqueue_struct *user_dlm_worker;
/*
* These are the ABI capabilities of dlmfs.
*
* Over time, dlmfs has added some features that were not part of the
* initial ABI. Unfortunately, some of these features are not detectable
* via standard usage. For example, Linux's default poll always returns
* POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
* added poll support. Instead, we provide this list of new capabilities.
*
* Capabilities is a read-only attribute. We do it as a module parameter
* so we can discover it whether dlmfs is built in, loaded, or even not
* loaded.
*
* The ABI features are local to this machine's dlmfs mount. This is
* distinct from the locking protocol, which is concerned with inter-node
* interaction.
*
* Capabilities:
* - bast : POLLIN against the file descriptor of a held lock
* signifies a bast fired on the lock.
*/
#define DLMFS_CAPABILITIES "bast stackglue"
static int param_set_dlmfs_capabilities(const char *val,
struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);
return -EINVAL;
}
static int param_get_dlmfs_capabilities(char *buffer,
struct kernel_param *kp)
{
return strlcpy(buffer, DLMFS_CAPABILITIES,
strlen(DLMFS_CAPABILITIES) + 1);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
/*
* decodes a set of open flags into a valid lock level and a set of flags.
* returns < 0 if we have invalid flags
* flags which mean something to us:
* O_RDONLY -> PRMODE level
* O_WRONLY -> EXMODE level
*
* O_NONBLOCK -> NOQUEUE
*/
static int dlmfs_decode_open_flags(int open_flags,
int *level,
int *flags)
{
if (open_flags & (O_WRONLY|O_RDWR))
*level = DLM_LOCK_EX;
else
*level = DLM_LOCK_PR;
*flags = 0;
if (open_flags & O_NONBLOCK)
*flags |= DLM_LKF_NOQUEUE;
return 0;
}
static int dlmfs_file_open(struct inode *inode,
struct file *file)
{
int status, level, flags;
struct dlmfs_filp_private *fp = NULL;
struct dlmfs_inode_private *ip;
if (S_ISDIR(inode->i_mode))
BUG();
mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
file->f_flags);
status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
if (status < 0)
goto bail;
/* We don't want to honor O_APPEND at read/write time as it
* doesn't make sense for LVB writes. */
file->f_flags &= ~O_APPEND;
fp = kmalloc(sizeof(*fp), GFP_NOFS);
if (!fp) {
status = -ENOMEM;
goto bail;
}
fp->fp_lock_level = level;
ip = DLMFS_I(inode);
status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
if (status < 0) {
/* this is a strange error to return here but I want
* to be able userspace to be able to distinguish a
* valid lock request from one that simply couldn't be
* granted. */
if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
status = -ETXTBSY;
kfree(fp);
goto bail;
}
file->private_data = fp;
bail:
return status;
}
static int dlmfs_file_release(struct inode *inode,
struct file *file)
{
int level, status;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
struct dlmfs_filp_private *fp = file->private_data;
if (S_ISDIR(inode->i_mode))
BUG();
mlog(0, "close called on inode %lu\n", inode->i_ino);
status = 0;
if (fp) {
level = fp->fp_lock_level;
if (level != DLM_LOCK_IV)
user_dlm_cluster_unlock(&ip->ip_lockres, level);
kfree(fp);
file->private_data = NULL;
}
return 0;
}
/*
* We do ->setattr() just to override size changes. Our size is the size
* of the LVB and nothing else.
*/
static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
struct inode *inode = dentry->d_inode;
attr->ia_valid &= ~ATTR_SIZE;
error = inode_change_ok(inode, attr);
if (error)
return error;
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
}
static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
{
int event = 0;
struct inode *inode = file_inode(file);
struct dlmfs_inode_private *ip = DLMFS_I(inode);
poll_wait(file, &ip->ip_lockres.l_event, wait);
spin_lock(&ip->ip_lockres.l_lock);
if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
event = POLLIN | POLLRDNORM;
spin_unlock(&ip->ip_lockres.l_lock);
return event;
}
static ssize_t dlmfs_file_read(struct file *filp,
char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
ssize_t readlen, got;
char *lvb_buf;
struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
if (*ppos >= i_size_read(inode))
return 0;
if (!count)
return 0;
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
/* don't read past the lvb */
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
got = user_dlm_read_lvb(inode, lvb_buf, readlen);
if (got) {
BUG_ON(got != readlen);
bytes_left = __copy_to_user(buf, lvb_buf, readlen);
readlen -= bytes_left;
} else
readlen = 0;
kfree(lvb_buf);
*ppos = *ppos + readlen;
mlog(0, "read %zd bytes\n", readlen);
return readlen;
}
static ssize_t dlmfs_file_write(struct file *filp,
const char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
ssize_t writelen;
char *lvb_buf;
struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
if (*ppos >= i_size_read(inode))
return -ENOSPC;
if (!count)
return 0;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
/* don't write past the lvb */
if ((count + *ppos) > i_size_read(inode))
writelen = i_size_read(inode) - *ppos;
else
writelen = count - *ppos;
lvb_buf = kmalloc(writelen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
bytes_left = copy_from_user(lvb_buf, buf, writelen);
writelen -= bytes_left;
if (writelen)
user_dlm_write_lvb(inode, lvb_buf, writelen);
kfree(lvb_buf);
*ppos = *ppos + writelen;
mlog(0, "wrote %zd bytes\n", writelen);
return writelen;
}
static void dlmfs_init_once(void *foo)
{
struct dlmfs_inode_private *ip =
(struct dlmfs_inode_private *) foo;
ip->ip_conn = NULL;
ip->ip_parent = NULL;
inode_init_once(&ip->ip_vfs_inode);
}
static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
return &ip->ip_vfs_inode;
}
static void dlmfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
static void dlmfs_destroy_inode(struct inode *inode)
{
call_rcu(&inode->i_rcu, dlmfs_i_callback);
}
static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
ip = DLMFS_I(inode);
if (S_ISREG(inode->i_mode)) {
status = user_dlm_destroy_lock(&ip->ip_lockres);
if (status < 0)
mlog_errno(status);
iput(ip->ip_parent);
goto clear_fields;
}
mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
/* we must be a directory. If required, lets unregister the
* dlm context now. */
if (ip->ip_conn)
user_dlm_unregister(ip->ip_conn);
clear_fields:
ip->ip_parent = NULL;
ip->ip_conn = NULL;
}
static struct backing_dev_info dlmfs_backing_dev_info = {
.name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
umode_t mode = S_IFDIR | 0755;
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, NULL, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
inode->i_op = &dlmfs_root_inode_operations;
}
return inode;
}
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
umode_t mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
struct dlmfs_inode_private *ip;
if (!inode)
return NULL;
inode->i_ino = get_next_ino();
inode_init_owner(inode, parent, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
switch (mode & S_IFMT) {
default:
/* for now we don't support anything other than
* directories and regular files. */
BUG();
break;
case S_IFREG:
inode->i_op = &dlmfs_file_inode_operations;
inode->i_fop = &dlmfs_file_operations;
i_size_write(inode, DLM_LVB_LEN);
user_dlm_lock_res_init(&ip->ip_lockres, dentry);
/* released at clear_inode time, this insures that we
* get to drop the dlm reference on each lock *before*
* we call the unregister code for releasing parent
* directories. */
ip->ip_parent = igrab(parent);
BUG_ON(!ip->ip_parent);
break;
case S_IFDIR:
inode->i_op = &dlmfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink ==
* 2 (for "." entry) */
inc_nlink(inode);
break;
}
return inode;
}
/*
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
umode_t mode)
{
int status;
struct inode *inode = NULL;
struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
struct ocfs2_cluster_connection *conn;
mlog(0, "mkdir %.*s\n", domain->len, domain->name);
/* verify that we have a proper domain */
if (domain->len >= GROUP_NAME_MAX) {
status = -EINVAL;
mlog(ML_ERROR, "invalid domain name for directory.\n");
goto bail;
}
inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
if (!inode) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
ip = DLMFS_I(inode);
conn = user_dlm_register(domain);
if (IS_ERR(conn)) {
status = PTR_ERR(conn);
mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
status, domain->len, domain->name);
goto bail;
}
ip->ip_conn = conn;
inc_nlink(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
status = 0;
bail:
if (status < 0)
iput(inode);
return status;
}
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
umode_t mode,
bool excl)
{
int status = 0;
struct inode *inode;
struct qstr *name = &dentry->d_name;
mlog(0, "create %.*s\n", name->len, name->name);
/* verify name is valid and doesn't contain any dlm reserved
* characters */
if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
name->name[0] == '$') {
status = -EINVAL;
mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
name->name);
goto bail;
}
inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
if (!inode) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
bail:
return status;
}
static int dlmfs_unlink(struct inode *dir,
struct dentry *dentry)
{
int status;
struct inode *inode = dentry->d_inode;
mlog(0, "unlink inode %lu\n", inode->i_ino);
/* if there are no current holders, or none that are waiting
* to acquire a lock, this basically destroys our lockres. */
status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
if (status < 0) {
mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
dentry->d_name.len, dentry->d_name.name, status);
goto bail;
}
status = simple_unlink(dir, dentry);
bail:
return status;
}
static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
if (!sb->s_root)
return -ENOMEM;
return 0;
}
static const struct file_operations dlmfs_file_operations = {
.open = dlmfs_file_open,
.release = dlmfs_file_release,
.poll = dlmfs_file_poll,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
.llseek = default_llseek,
};
static const struct inode_operations dlmfs_dir_inode_operations = {
.create = dlmfs_create,
.lookup = simple_lookup,
.unlink = dlmfs_unlink,
};
/* this way we can restrict mkdir to only the toplevel of the fs. */
static const struct inode_operations dlmfs_root_inode_operations = {
.lookup = simple_lookup,
.mkdir = dlmfs_mkdir,
.rmdir = simple_rmdir,
};
static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
.destroy_inode = dlmfs_destroy_inode,
.evict_inode = dlmfs_evict_inode,
.drop_inode = generic_delete_inode,
};
static const struct inode_operations dlmfs_file_inode_operations = {
.getattr = simple_getattr,
.setattr = dlmfs_file_setattr,
};
static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
.mount = dlmfs_mount,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("ocfs2_dlmfs");
static int __init init_dlmfs_fs(void)
{
int status;
int cleanup_inode = 0, cleanup_worker = 0;
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
goto bail;
}
cleanup_inode = 1;
user_dlm_worker = create_singlethread_workqueue("user_dlm");
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
}
cleanup_worker = 1;
user_dlm_set_locking_protocol();
status = register_filesystem(&dlmfs_fs_type);
bail:
if (status) {
if (cleanup_inode)
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
}
static void __exit exit_dlmfs_fs(void)
{
unregister_filesystem(&dlmfs_fs_type);
flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
/*
* Make sure all delayed rcu free inodes are flushed before we
* destroy cache.
*/
rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)

688
fs/ocfs2/dlmfs/userdlm.c Normal file
View file

@ -0,0 +1,688 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* userdlm.c
*
* Code which implements the kernel side of a minimal userspace
* interface to our DLM.
*
* Many of the functions here are pared down versions of dlmglue.c
* functions.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/signal.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/crc32.h>
#include "ocfs2_lockingver.h"
#include "stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
{
return container_of(lksb, struct user_lock_res, l_lksb);
}
static inline int user_check_wait_flag(struct user_lock_res *lockres,
int flag)
{
int ret;
spin_lock(&lockres->l_lock);
ret = lockres->l_flags & flag;
spin_unlock(&lockres->l_lock);
return ret;
}
static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
{
wait_event(lockres->l_event,
!user_check_wait_flag(lockres, USER_LOCK_BUSY));
}
static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
{
wait_event(lockres->l_event,
!user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
}
/* I heart container_of... */
static inline struct ocfs2_cluster_connection *
cluster_connection_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
return ip->ip_conn;
}
static struct inode *
user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
return &ip->ip_vfs_inode;
}
static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
{
spin_lock(&lockres->l_lock);
lockres->l_flags &= ~USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
}
#define user_log_dlm_error(_func, _stat, _lockres) do { \
mlog(ML_ERROR, "Dlm error %d while calling %s on " \
"resource %.*s\n", _stat, _func, \
_lockres->l_namelen, _lockres->l_name); \
} while (0)
/* WARNING: This function lives in a world where the only three lock
* levels are EX, PR, and NL. It *will* have to be adjusted when more
* lock types are added. */
static inline int user_highest_compat_lock_level(int level)
{
int new_level = DLM_LOCK_EX;
if (level == DLM_LOCK_EX)
new_level = DLM_LOCK_NL;
else if (level == DLM_LOCK_PR)
new_level = DLM_LOCK_PR;
return new_level;
}
static void user_ast(struct ocfs2_dlm_lksb *lksb)
{
struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
int status;
mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
lockres->l_namelen, lockres->l_name, lockres->l_level,
lockres->l_requested);
spin_lock(&lockres->l_lock);
status = ocfs2_dlm_lock_status(&lockres->l_lksb);
if (status) {
mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
status, lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
return;
}
mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
"Lockres %.*s, requested ivmode. flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
/* we're downconverting. */
if (lockres->l_requested < lockres->l_level) {
if (lockres->l_requested <=
user_highest_compat_lock_level(lockres->l_blocking)) {
lockres->l_blocking = DLM_LOCK_NL;
lockres->l_flags &= ~USER_LOCK_BLOCKED;
}
}
lockres->l_level = lockres->l_requested;
lockres->l_requested = DLM_LOCK_IV;
lockres->l_flags |= USER_LOCK_ATTACHED;
lockres->l_flags &= ~USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
inode = user_dlm_inode_from_user_lockres(lockres);
if (!igrab(inode))
BUG();
}
static void user_dlm_unblock_lock(struct work_struct *work);
static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
{
if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
user_dlm_grab_inode_ref(lockres);
INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
queue_work(user_dlm_worker, &lockres->l_work);
lockres->l_flags |= USER_LOCK_QUEUED;
}
}
static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
{
int queue = 0;
if (!(lockres->l_flags & USER_LOCK_BLOCKED))
return;
switch (lockres->l_blocking) {
case DLM_LOCK_EX:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
queue = 1;
break;
case DLM_LOCK_PR:
if (!lockres->l_ex_holders)
queue = 1;
break;
default:
BUG();
}
if (queue)
__user_dlm_queue_lockres(lockres);
}
static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
{
struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
lockres->l_namelen, lockres->l_name, level, lockres->l_level);
spin_lock(&lockres->l_lock);
lockres->l_flags |= USER_LOCK_BLOCKED;
if (level > lockres->l_blocking)
lockres->l_blocking = level;
__user_dlm_queue_lockres(lockres);
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
{
struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
if (status)
mlog(ML_ERROR, "dlm returns status %d\n", status);
spin_lock(&lockres->l_lock);
/* The teardown flag gets set early during the unlock process,
* so test the cancel flag to make sure that this ast isn't
* for a concurrent cancel. */
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
&& !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
lockres->l_level = DLM_LOCK_IV;
} else if (status == DLM_CANCELGRANT) {
/* We tried to cancel a convert request, but it was
* already granted. Don't clear the busy flag - the
* ast should've done this already. */
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
goto out_noclear;
} else {
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
/* Cancel succeeded, we want to re-queue */
lockres->l_requested = DLM_LOCK_IV; /* cancel an
* upconvert
* request. */
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
/* we want the unblock thread to look at it again
* now. */
if (lockres->l_flags & USER_LOCK_BLOCKED)
__user_dlm_queue_lockres(lockres);
}
lockres->l_flags &= ~USER_LOCK_BUSY;
out_noclear:
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
/*
* This is the userdlmfs locking protocol version.
*
* See fs/ocfs2/dlmglue.c for more details on locking versions.
*/
static struct ocfs2_locking_protocol user_dlm_lproto = {
.lp_max_version = {
.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
},
.lp_lock_ast = user_ast,
.lp_blocking_ast = user_bast,
.lp_unlock_ast = user_unlock_ast,
};
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
inode = user_dlm_inode_from_user_lockres(lockres);
iput(inode);
}
static void user_dlm_unblock_lock(struct work_struct *work)
{
int new_level, status;
struct user_lock_res *lockres =
container_of(work, struct user_lock_res, l_work);
struct ocfs2_cluster_connection *conn =
cluster_connection_from_user_lockres(lockres);
mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
"Lockres %.*s, flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
* set, we want user_ast clear it. */
lockres->l_flags &= ~USER_LOCK_QUEUED;
/* It's valid to get here and no longer be blocked - if we get
* several basts in a row, we might be queued by the first
* one, the unblock thread might run and clear the queued
* flag, and finally we might get another bast which re-queues
* us before our ast for the downconvert is called. */
if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_BUSY) {
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
lockres->l_flags |= USER_LOCK_IN_CANCEL;
spin_unlock(&lockres->l_lock);
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
DLM_LKF_CANCEL);
if (status)
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto drop_ref;
}
/* If there are still incompat holders, we can exit safely
* without worrying about re-queueing this lock as that will
* happen on the last call to user_cluster_unlock. */
if ((lockres->l_blocking == DLM_LOCK_EX)
&& (lockres->l_ex_holders || lockres->l_ro_holders)) {
spin_unlock(&lockres->l_lock);
mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
lockres->l_namelen, lockres->l_name,
lockres->l_ex_holders, lockres->l_ro_holders);
goto drop_ref;
}
if ((lockres->l_blocking == DLM_LOCK_PR)
&& lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
lockres->l_namelen, lockres->l_name,
lockres->l_ex_holders);
goto drop_ref;
}
/* yay, we can downconvert now. */
new_level = user_highest_compat_lock_level(lockres->l_blocking);
lockres->l_requested = new_level;
lockres->l_flags |= USER_LOCK_BUSY;
mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
spin_unlock(&lockres->l_lock);
/* need lock downconvert request now... */
status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
DLM_LKF_CONVERT|DLM_LKF_VALBLK,
lockres->l_name,
lockres->l_namelen);
if (status) {
user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
user_recover_from_dlm_error(lockres);
}
drop_ref:
user_dlm_drop_inode_ref(lockres);
}
static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
case DLM_LOCK_EX:
lockres->l_ex_holders++;
break;
case DLM_LOCK_PR:
lockres->l_ro_holders++;
break;
default:
BUG();
}
}
/* predict what lock level we'll be dropping down to on behalf
* of another node, and return true if the currently wanted
* level will be compatible with it. */
static inline int
user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
int wanted)
{
BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
}
int user_dlm_cluster_lock(struct user_lock_res *lockres,
int level,
int lkm_flags)
{
int status, local_flags;
struct ocfs2_cluster_connection *conn =
cluster_connection_from_user_lockres(lockres);
if (level != DLM_LOCK_EX &&
level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
status = -EINVAL;
goto bail;
}
mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
lockres->l_namelen, lockres->l_name, level, lkm_flags);
again:
if (signal_pending(current)) {
status = -ERESTARTSYS;
goto bail;
}
spin_lock(&lockres->l_lock);
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
* we'll get caught below. */
if ((lockres->l_flags & USER_LOCK_BUSY) &&
(level > lockres->l_level)) {
/* is someone sitting in dlm_lock? If so, wait on
* them. */
spin_unlock(&lockres->l_lock);
user_wait_on_busy_lock(lockres);
goto again;
}
if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
(!user_may_continue_on_blocked_lock(lockres, level))) {
/* is the lock is currently blocked on behalf of
* another node */
spin_unlock(&lockres->l_lock);
user_wait_on_blocked_lock(lockres);
goto again;
}
if (level > lockres->l_level) {
local_flags = lkm_flags | DLM_LKF_VALBLK;
if (lockres->l_level != DLM_LOCK_IV)
local_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
BUG_ON(level == DLM_LOCK_IV);
BUG_ON(level == DLM_LOCK_NL);
/* call dlm_lock to upgrade lock now */
status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
local_flags, lockres->l_name,
lockres->l_namelen);
if (status) {
if ((lkm_flags & DLM_LKF_NOQUEUE) &&
(status != -EAGAIN))
user_log_dlm_error("ocfs2_dlm_lock",
status, lockres);
user_recover_from_dlm_error(lockres);
goto bail;
}
user_wait_on_busy_lock(lockres);
goto again;
}
user_dlm_inc_holders(lockres, level);
spin_unlock(&lockres->l_lock);
status = 0;
bail:
return status;
}
static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
case DLM_LOCK_EX:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
case DLM_LOCK_PR:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
default:
BUG();
}
}
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level)
{
if (level != DLM_LOCK_EX &&
level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
return;
}
spin_lock(&lockres->l_lock);
user_dlm_dec_holders(lockres, level);
__user_dlm_cond_queue_lockres(lockres);
spin_unlock(&lockres->l_lock);
}
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
char *lvb;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
BUG_ON(lockres->l_level < DLM_LOCK_EX);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
memcpy(lvb, val, len);
spin_unlock(&lockres->l_lock);
}
ssize_t user_dlm_read_lvb(struct inode *inode,
char *val,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
char *lvb;
ssize_t ret = len;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
BUG_ON(lockres->l_level < DLM_LOCK_PR);
if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
memcpy(val, lvb, len);
} else
ret = 0;
spin_unlock(&lockres->l_lock);
return ret;
}
void user_dlm_lock_res_init(struct user_lock_res *lockres,
struct dentry *dentry)
{
memset(lockres, 0, sizeof(*lockres));
spin_lock_init(&lockres->l_lock);
init_waitqueue_head(&lockres->l_event);
lockres->l_level = DLM_LOCK_IV;
lockres->l_requested = DLM_LOCK_IV;
lockres->l_blocking = DLM_LOCK_IV;
/* should have been checked before getting here. */
BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
memcpy(lockres->l_name,
dentry->d_name.name,
dentry->d_name.len);
lockres->l_namelen = dentry->d_name.len;
}
int user_dlm_destroy_lock(struct user_lock_res *lockres)
{
int status = -EBUSY;
struct ocfs2_cluster_connection *conn =
cluster_connection_from_user_lockres(lockres);
mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
return 0;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
while (lockres->l_flags & USER_LOCK_BUSY) {
spin_unlock(&lockres->l_lock);
user_wait_on_busy_lock(lockres);
spin_lock(&lockres->l_lock);
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
spin_unlock(&lockres->l_lock);
goto bail;
}
lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
if (status) {
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
user_wait_on_busy_lock(lockres);
status = 0;
bail:
return status;
}
static void user_dlm_recovery_handler_noop(int node_num,
void *recovery_data)
{
/* We ignore recovery events */
return;
}
void user_dlm_set_locking_protocol(void)
{
ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
}
struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
{
int rc;
struct ocfs2_cluster_connection *conn;
rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
&user_dlm_lproto,
user_dlm_recovery_handler_noop,
NULL, &conn);
if (rc)
mlog_errno(rc);
return rc ? ERR_PTR(rc) : conn;
}
void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
{
ocfs2_cluster_disconnect(conn, 0);
}

113
fs/ocfs2/dlmfs/userdlm.h Normal file
View file

@ -0,0 +1,113 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* userdlm.h
*
* Userspace dlm defines
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef USERDLM_H
#define USERDLM_H
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/workqueue.h>
/* user_lock_res->l_flags flags. */
#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized
* the lvb */
#define USER_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to
* downconvert*/
#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently
* destroying this
* lock. */
#define USER_LOCK_QUEUED (0x00000010) /* lock is on the
* workqueue */
#define USER_LOCK_IN_CANCEL (0x00000020)
struct user_lock_res {
spinlock_t l_lock;
int l_flags;
#define USER_DLM_LOCK_ID_MAX_LEN 32
char l_name[USER_DLM_LOCK_ID_MAX_LEN];
int l_namelen;
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
struct ocfs2_dlm_lksb l_lksb;
int l_requested;
int l_blocking;
wait_queue_head_t l_event;
struct work_struct l_work;
};
extern struct workqueue_struct *user_dlm_worker;
void user_dlm_lock_res_init(struct user_lock_res *lockres,
struct dentry *dentry);
int user_dlm_destroy_lock(struct user_lock_res *lockres);
int user_dlm_cluster_lock(struct user_lock_res *lockres,
int level,
int lkm_flags);
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level);
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len);
ssize_t user_dlm_read_lvb(struct inode *inode,
char *val,
unsigned int len);
struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
void user_dlm_set_locking_protocol(void);
struct dlmfs_inode_private {
struct ocfs2_cluster_connection *ip_conn;
struct user_lock_res ip_lockres; /* unused for directories. */
struct inode *ip_parent;
struct inode ip_vfs_inode;
};
static inline struct dlmfs_inode_private *
DLMFS_I(struct inode *inode)
{
return container_of(inode,
struct dlmfs_inode_private,
ip_vfs_inode);
}
struct dlmfs_filp_private {
int fp_lock_level;
};
#define DLMFS_MAGIC 0x76a9f425
#endif /* USERDLM_H */

4074
fs/ocfs2/dlmglue.c Normal file

File diff suppressed because it is too large Load diff

173
fs/ocfs2/dlmglue.h Normal file
View file

@ -0,0 +1,173 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmglue.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef DLMGLUE_H
#define DLMGLUE_H
#include "dcache.h"
#define OCFS2_LVB_VERSION 5
struct ocfs2_meta_lvb {
__u8 lvb_version;
__u8 lvb_reserved0;
__be16 lvb_idynfeatures;
__be32 lvb_iclusters;
__be32 lvb_iuid;
__be32 lvb_igid;
__be64 lvb_iatime_packed;
__be64 lvb_ictime_packed;
__be64 lvb_imtime_packed;
__be64 lvb_isize;
__be16 lvb_imode;
__be16 lvb_inlink;
__be32 lvb_iattr;
__be32 lvb_igeneration;
__be32 lvb_reserved2;
};
#define OCFS2_QINFO_LVB_VERSION 1
struct ocfs2_qinfo_lvb {
__u8 lvb_version;
__u8 lvb_reserved[3];
__be32 lvb_bgrace;
__be32 lvb_igrace;
__be32 lvb_syncms;
__be32 lvb_blocks;
__be32 lvb_free_blk;
__be32 lvb_free_entry;
};
#define OCFS2_ORPHAN_LVB_VERSION 1
struct ocfs2_orphan_scan_lvb {
__u8 lvb_version;
__u8 lvb_reserved[3];
__be32 lvb_os_seqno;
};
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
/* Instruct the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE (0x02)
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
/* Locking subclasses of inode cluster lock */
enum {
OI_LS_NORMAL = 0,
OI_LS_PARENT,
OI_LS_RENAME1,
OI_LS_RENAME2,
OI_LS_REFLINK_TARGET,
};
int ocfs2_dlm_init(struct ocfs2_super *osb);
void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
unsigned int generation,
struct inode *inode);
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
u64 parent, struct inode *inode);
struct ocfs2_file_private;
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp);
struct ocfs2_mem_dqinfo;
void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_mem_dqinfo *info);
void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_super *osb, u64 ref_blkno,
unsigned int generation);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
int ocfs2_inode_lock_full_nested(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags,
int subclass);
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page);
/* Variants without special locking class or flags */
#define ocfs2_inode_lock_full(i, r, e, f)\
ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
#define ocfs2_inode_lock_nested(i, b, e, s)\
ocfs2_inode_lock_full_nested(i, b, e, 0, s)
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
struct ocfs2_refcount_tree;
int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
/* for the downconvert thread */
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
/* To set the locking protocol on module initialization */
void ocfs2_set_locking_protocol(void);
#endif /* DLMGLUE_H */

271
fs/ocfs2/export.c Normal file
View file

@ -0,0 +1,271 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* export.c
*
* Functions to facilitate NFS exporting
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dir.h"
#include "dlmglue.h"
#include "dcache.h"
#include "export.h"
#include "inode.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "ocfs2_trace.h"
struct ocfs2_inode_handle
{
u64 ih_blkno;
u32 ih_generation;
};
static struct dentry *ocfs2_get_dentry(struct super_block *sb,
struct ocfs2_inode_handle *handle)
{
struct inode *inode;
struct ocfs2_super *osb = OCFS2_SB(sb);
u64 blkno = handle->ih_blkno;
int status, set;
struct dentry *result;
trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
if (blkno == 0) {
result = ERR_PTR(-ESTALE);
goto bail;
}
inode = ocfs2_ilookup(sb, blkno);
/*
* If the inode exists in memory, we only need to check it's
* generation number
*/
if (inode)
goto check_gen;
/*
* This will synchronize us against ocfs2_delete_inode() on
* all nodes
*/
status = ocfs2_nfs_sync_lock(osb, 1);
if (status < 0) {
mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
goto check_err;
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
* The blkno NFS gave us doesn't even show up
* as an inode, we return -ESTALE to be
* nice
*/
status = -ESTALE;
} else
mlog(ML_ERROR, "test inode bit failed %d\n", status);
goto unlock_nfs_sync;
}
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
status = -ESTALE;
goto unlock_nfs_sync;
}
inode = ocfs2_iget(osb, blkno, 0, 0);
unlock_nfs_sync:
ocfs2_nfs_sync_unlock(osb, 1);
check_err:
if (status < 0) {
if (status == -ESTALE) {
trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
handle->ih_generation);
}
result = ERR_PTR(status);
goto bail;
}
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
result = (void *)inode;
goto bail;
}
check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
handle->ih_generation,
inode->i_generation);
result = ERR_PTR(-ESTALE);
goto bail;
}
result = d_obtain_alias(inode);
if (IS_ERR(result))
mlog_errno(PTR_ERR(result));
bail:
trace_ocfs2_get_dentry_end(result);
return result;
}
static struct dentry *ocfs2_get_parent(struct dentry *child)
{
int status;
u64 blkno;
struct dentry *parent;
struct inode *dir = child->d_inode;
trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno);
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
parent = ERR_PTR(status);
goto bail;
}
status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
if (status < 0) {
parent = ERR_PTR(-ENOENT);
goto bail_unlock;
}
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
bail_unlock:
ocfs2_inode_unlock(dir, 0);
bail:
trace_ocfs2_get_parent_end(parent);
return parent;
}
static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
struct inode *parent)
{
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
__le32 *fh = (__force __le32 *) fh_in;
#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
#error "You go ahead and fix that mess, then. Somehow"
trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
dentry->d_name.name,
fh, len, connectable);
#endif
if (parent && (len < 6)) {
*max_len = 6;
type = FILEID_INVALID;
goto bail;
} else if (len < 3) {
*max_len = 3;
type = FILEID_INVALID;
goto bail;
}
blkno = OCFS2_I(inode)->ip_blkno;
generation = inode->i_generation;
trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
len = 3;
fh[0] = cpu_to_le32((u32)(blkno >> 32));
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
if (parent) {
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
fh[3] = cpu_to_le32((u32)(blkno >> 32));
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
len = 6;
type = 2;
trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
generation);
}
*max_len = len;
bail:
trace_ocfs2_encode_fh_type(type);
return type;
}
static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
struct ocfs2_inode_handle handle;
if (fh_len < 3 || fh_type > 2)
return NULL;
handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
handle.ih_generation = le32_to_cpu(fid->raw[2]);
return ocfs2_get_dentry(sb, &handle);
}
static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
struct ocfs2_inode_handle parent;
if (fh_type != 2 || fh_len < 6)
return NULL;
parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
parent.ih_generation = le32_to_cpu(fid->raw[5]);
return ocfs2_get_dentry(sb, &parent);
}
const struct export_operations ocfs2_export_ops = {
.encode_fh = ocfs2_encode_fh,
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
};

33
fs/ocfs2/export.h Normal file
View file

@ -0,0 +1,33 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* export.h
*
* Function prototypes
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_EXPORT_H
#define OCFS2_EXPORT_H
#include <linux/exportfs.h>
extern const struct export_operations ocfs2_export_ops;
#endif /* OCFS2_EXPORT_H */

994
fs/ocfs2/extent_map.c Normal file
View file

@ -0,0 +1,994 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* extent_map.c
*
* Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fiemap.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "symlink.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
/*
* The extent caching implementation is intentionally trivial.
*
* We only cache a small number of extents stored directly on the
* inode, so linear order operations are acceptable. If we ever want
* to increase the size of the extent map, then these algorithms must
* get smarter.
*/
void ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
oi->ip_extent_map.em_num_items = 0;
INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
}
static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
unsigned int cpos,
struct ocfs2_extent_map_item **ret_emi)
{
unsigned int range;
struct ocfs2_extent_map_item *emi;
*ret_emi = NULL;
list_for_each_entry(emi, &em->em_list, ei_list) {
range = emi->ei_cpos + emi->ei_clusters;
if (cpos >= emi->ei_cpos && cpos < range) {
list_move(&emi->ei_list, &em->em_list);
*ret_emi = emi;
break;
}
}
}
static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
unsigned int *phys, unsigned int *len,
unsigned int *flags)
{
unsigned int coff;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map_item *emi;
spin_lock(&oi->ip_lock);
__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
if (emi) {
coff = cpos - emi->ei_cpos;
*phys = emi->ei_phys + coff;
if (len)
*len = emi->ei_clusters - coff;
if (flags)
*flags = emi->ei_flags;
}
spin_unlock(&oi->ip_lock);
if (emi == NULL)
return -ENOENT;
return 0;
}
/*
* Forget about all clusters equal to or greater than cpos.
*/
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
struct ocfs2_extent_map_item *emi, *n;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map *em = &oi->ip_extent_map;
LIST_HEAD(tmp_list);
unsigned int range;
spin_lock(&oi->ip_lock);
list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
if (emi->ei_cpos >= cpos) {
/* Full truncate of this record. */
list_move(&emi->ei_list, &tmp_list);
BUG_ON(em->em_num_items == 0);
em->em_num_items--;
continue;
}
range = emi->ei_cpos + emi->ei_clusters;
if (range > cpos) {
/* Partial truncate */
emi->ei_clusters = cpos - emi->ei_cpos;
}
}
spin_unlock(&oi->ip_lock);
list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
list_del(&emi->ei_list);
kfree(emi);
}
}
/*
* Is any part of emi2 contained within emi1
*/
static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
struct ocfs2_extent_map_item *emi2)
{
unsigned int range1, range2;
/*
* Check if logical start of emi2 is inside emi1
*/
range1 = emi1->ei_cpos + emi1->ei_clusters;
if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
return 1;
/*
* Check if logical end of emi2 is inside emi1
*/
range2 = emi2->ei_cpos + emi2->ei_clusters;
if (range2 > emi1->ei_cpos && range2 <= range1)
return 1;
return 0;
}
static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
struct ocfs2_extent_map_item *src)
{
dest->ei_cpos = src->ei_cpos;
dest->ei_phys = src->ei_phys;
dest->ei_clusters = src->ei_clusters;
dest->ei_flags = src->ei_flags;
}
/*
* Try to merge emi with ins. Returns 1 if merge succeeds, zero
* otherwise.
*/
static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
struct ocfs2_extent_map_item *ins)
{
/*
* Handle contiguousness
*/
if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
ins->ei_flags == emi->ei_flags) {
emi->ei_clusters += ins->ei_clusters;
return 1;
} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
(ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
ins->ei_flags == emi->ei_flags) {
emi->ei_phys = ins->ei_phys;
emi->ei_cpos = ins->ei_cpos;
emi->ei_clusters += ins->ei_clusters;
return 1;
}
/*
* Overlapping extents - this shouldn't happen unless we've
* split an extent to change it's flags. That is exceedingly
* rare, so there's no sense in trying to optimize it yet.
*/
if (ocfs2_ei_is_contained(emi, ins) ||
ocfs2_ei_is_contained(ins, emi)) {
ocfs2_copy_emi_fields(emi, ins);
return 1;
}
/* No merge was possible. */
return 0;
}
/*
* In order to reduce complexity on the caller, this insert function
* is intentionally liberal in what it will accept.
*
* The only rule is that the truncate call *must* be used whenever
* records have been deleted. This avoids inserting overlapping
* records with different physical mappings.
*/
void ocfs2_extent_map_insert_rec(struct inode *inode,
struct ocfs2_extent_rec *rec)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map *em = &oi->ip_extent_map;
struct ocfs2_extent_map_item *emi, *new_emi = NULL;
struct ocfs2_extent_map_item ins;
ins.ei_cpos = le32_to_cpu(rec->e_cpos);
ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
ins.ei_flags = rec->e_flags;
search:
spin_lock(&oi->ip_lock);
list_for_each_entry(emi, &em->em_list, ei_list) {
if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
list_move(&emi->ei_list, &em->em_list);
spin_unlock(&oi->ip_lock);
goto out;
}
}
/*
* No item could be merged.
*
* Either allocate and add a new item, or overwrite the last recently
* inserted.
*/
if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
if (new_emi == NULL) {
spin_unlock(&oi->ip_lock);
new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
if (new_emi == NULL)
goto out;
goto search;
}
ocfs2_copy_emi_fields(new_emi, &ins);
list_add(&new_emi->ei_list, &em->em_list);
em->em_num_items++;
new_emi = NULL;
} else {
BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
emi = list_entry(em->em_list.prev,
struct ocfs2_extent_map_item, ei_list);
list_move(&emi->ei_list, &em->em_list);
ocfs2_copy_emi_fields(emi, &ins);
}
spin_unlock(&oi->ip_lock);
out:
kfree(new_emi);
}
static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_dinode *di)
{
int ret, next_free;
u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"leaf block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0 ||
(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
ret = 1;
out:
brelse(eb_bh);
return ret;
}
/*
* Return the 1st index within el which contains an extent start
* larger than v_cluster.
*/
static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
u32 v_cluster)
{
int i;
struct ocfs2_extent_rec *rec;
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
if (v_cluster < le32_to_cpu(rec->e_cpos))
break;
}
return i;
}
/*
* Figure out the size of a hole which starts at v_cluster within the given
* extent list.
*
* If there is no more allocation past v_cluster, we return the maximum
* cluster size minus v_cluster.
*
* If we have in-inode extents, then el points to the dinode list and
* eb_bh is NULL. Otherwise, eb_bh should point to the extent block
* containing el.
*/
int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *el,
struct buffer_head *eb_bh,
u32 v_cluster,
u32 *num_clusters)
{
int ret, i;
struct buffer_head *next_eb_bh = NULL;
struct ocfs2_extent_block *eb, *next_eb;
i = ocfs2_search_for_hole_index(el, v_cluster);
if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
/*
* Check the next leaf for any extents.
*/
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
ret = ocfs2_read_extent_block(ci,
le64_to_cpu(eb->h_next_leaf_blk),
&next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
el = &next_eb->h_list;
i = ocfs2_search_for_hole_index(el, v_cluster);
}
no_more_extents:
if (i == le16_to_cpu(el->l_next_free_rec)) {
/*
* We're at the end of our existing allocation. Just
* return the maximum number of clusters we could
* possibly allocate.
*/
*num_clusters = UINT_MAX - v_cluster;
} else {
*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
}
ret = 0;
out:
brelse(next_eb_bh);
return ret;
}
static int ocfs2_get_clusters_nocache(struct inode *inode,
struct buffer_head *di_bh,
u32 v_cluster, unsigned int *hole_len,
struct ocfs2_extent_rec *ret_rec,
unsigned int *is_last)
{
int i, ret, tree_height, len;
struct ocfs2_dinode *di;
struct ocfs2_extent_block *uninitialized_var(eb);
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct buffer_head *eb_bh = NULL;
memset(ret_rec, 0, sizeof(*ret_rec));
if (is_last)
*is_last = 0;
di = (struct ocfs2_dinode *) di_bh->b_data;
el = &di->id2.i_list;
tree_height = le16_to_cpu(el->l_tree_depth);
if (tree_height > 0) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
&eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"leaf block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
* Holes can be larger than the maximum size of an
* extent, so we return their lengths in a separate
* field.
*/
if (hole_len) {
ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
el, eb_bh,
v_cluster, &len);
if (ret) {
mlog_errno(ret);
goto out;
}
*hole_len = len;
}
goto out_hole;
}
rec = &el->l_recs[i];
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0)", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
*ret_rec = *rec;
/*
* Checking for last extent is potentially expensive - we
* might have to look at the next leaf over to see if it's
* empty.
*
* The first two checks are to see whether the caller even
* cares for this information, and if the extent is at least
* the last in it's list.
*
* If those hold true, then the extent is last if any of the
* additional conditions hold true:
* - Extent list is in-inode
* - Extent list is right-most
* - Extent list is 2nd to rightmost, with empty right-most
*/
if (is_last) {
if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
if (tree_height == 0)
*is_last = 1;
else if (eb->h_blkno == di->i_last_eb_blk)
*is_last = 1;
else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
ret = ocfs2_last_eb_is_empty(inode, di);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (ret == 1)
*is_last = 1;
}
}
}
out_hole:
ret = 0;
out:
brelse(eb_bh);
return ret;
}
static void ocfs2_relative_extent_offsets(struct super_block *sb,
u32 v_cluster,
struct ocfs2_extent_rec *rec,
u32 *p_cluster, u32 *num_clusters)
{
u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
}
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
unsigned int *extent_flags)
{
int ret = 0, i;
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec;
u32 coff;
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
&eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"xattr leaf block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
ret = -EROFS;
mlog_errno(ret);
goto out;
} else {
rec = &el->l_recs[i];
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0) in xattr", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
coff = v_cluster - le32_to_cpu(rec->e_cpos);
*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
if (extent_flags)
*extent_flags = rec->e_flags;
}
out:
if (eb_bh)
brelse(eb_bh);
return ret;
}
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
unsigned int *extent_flags)
{
int ret;
unsigned int uninitialized_var(hole_len), flags = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = -ERANGE;
mlog_errno(ret);
goto out;
}
ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
num_clusters, extent_flags);
if (ret == 0)
goto out;
ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
&rec, NULL);
if (ret) {
mlog_errno(ret);
goto out;
}
if (rec.e_blkno == 0ULL) {
/*
* A hole was found. Return some canned values that
* callers can key on. If asked for, num_clusters will
* be populated with the size of the hole.
*/
*p_cluster = 0;
if (num_clusters) {
*num_clusters = hole_len;
}
} else {
ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
p_cluster, num_clusters);
flags = rec.e_flags;
ocfs2_extent_map_insert_rec(inode, &rec);
}
if (extent_flags)
*extent_flags = flags;
out:
brelse(di_bh);
return ret;
}
/*
* This expects alloc_sem to be held. The allocation cannot change at
* all while the map is in the process of being updated.
*/
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags)
{
int ret;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 cpos, num_clusters, p_cluster;
u64 boff = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
extent_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* p_cluster == 0 indicates a hole.
*/
if (p_cluster) {
boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
boff += (v_blkno & (u64)(bpc - 1));
}
*p_blkno = boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
*ret_count -= v_blkno & (u64)(bpc - 1);
}
out:
return ret;
}
/*
* The ocfs2_fiemap_inline() may be a little bit misleading, since
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
u64 map_start)
{
int ret;
unsigned int id_count;
struct ocfs2_dinode *di;
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
id_count = ocfs2_fast_symlink_chars(inode->i_sb);
else
id_count = le16_to_cpu(di->id2.i_data.id_count);
if (map_start < id_count) {
phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
if (ocfs2_inode_is_fast_symlink(inode))
phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
else
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
if (ret < 0)
return ret;
}
return 0;
}
#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len)
{
int ret, is_last;
u32 mapping_end, cpos;
unsigned int hole_size;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u64 len_bytes, phys_bytes, virt_bytes;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
if (ret)
return ret;
ret = ocfs2_inode_lock(inode, &di_bh, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
/*
* Handle inline-data and fast symlink separately.
*/
if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
ocfs2_inode_is_fast_symlink(inode)) {
ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
goto out_unlock;
}
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
is_last = 0;
while (cpos < mapping_end && !is_last) {
u32 fe_flags;
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
cpos += hole_size;
continue;
}
fe_flags = 0;
if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
fe_flags |= FIEMAP_EXTENT_SHARED;
if (is_last)
fe_flags |= FIEMAP_EXTENT_LAST;
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
if (ret)
break;
cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
}
if (ret > 0)
ret = 0;
out_unlock:
brelse(di_bh);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 0);
out:
return ret;
}
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
unsigned int is_last = 0, is_data = 0;
u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
u32 cpos, cend, clen, hole_size;
u64 extoff, extlen;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
ret = ocfs2_inode_lock(inode, &di_bh, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (*offset >= i_size_read(inode)) {
ret = -ENXIO;
goto out_unlock;
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
if (whence == SEEK_HOLE)
*offset = i_size_read(inode);
goto out_unlock;
}
clen = 0;
cpos = *offset >> cs_bits;
cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
while (cpos < cend && !is_last) {
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
&rec, &is_last);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
extoff = cpos;
extoff <<= cs_bits;
if (rec.e_blkno == 0ULL) {
clen = hole_size;
is_data = 0;
} else {
clen = le16_to_cpu(rec.e_leaf_clusters) -
(cpos - le32_to_cpu(rec.e_cpos));
is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
}
if ((!is_data && whence == SEEK_HOLE) ||
(is_data && whence == SEEK_DATA)) {
if (extoff > *offset)
*offset = extoff;
goto out_unlock;
}
if (!is_last)
cpos += clen;
}
if (whence == SEEK_HOLE) {
extoff = cpos;
extoff <<= cs_bits;
extlen = clen;
extlen <<= cs_bits;
if ((extoff + extlen) > i_size_read(inode))
extlen = i_size_read(inode) - extoff;
extoff += extlen;
if (extoff > *offset)
*offset = extoff;
goto out_unlock;
}
ret = -ENXIO;
out_unlock:
brelse(di_bh);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 0);
out:
return ret;
}
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
{
int rc = 0;
u64 p_block, p_count;
int i, count, done = 0;
trace_ocfs2_read_virt_blocks(
inode, (unsigned long long)v_block, nr, bhs, flags,
validate);
if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
BUG_ON(!(flags & OCFS2_BH_READAHEAD));
goto out;
}
while (done < nr) {
down_read(&OCFS2_I(inode)->ip_alloc_sem);
rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
&p_block, &p_count, NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (rc) {
mlog_errno(rc);
break;
}
if (!p_block) {
rc = -EIO;
mlog(ML_ERROR,
"Inode #%llu contains a hole at offset %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)(v_block + done) <<
inode->i_sb->s_blocksize_bits);
break;
}
count = nr - done;
if (p_count < count)
count = p_count;
/*
* If the caller passed us bhs, they should have come
* from a previous readahead call to this function. Thus,
* they should have the right b_blocknr.
*/
for (i = 0; i < count; i++) {
if (!bhs[done + i])
continue;
BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
}
rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
bhs + done, flags, validate);
if (rc) {
mlog_errno(rc);
break;
}
done += count;
}
out:
return rc;
}

92
fs/ocfs2/extent_map.h Normal file
View file

@ -0,0 +1,92 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* extent_map.h
*
* In-memory file extent mappings for OCFS2.
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
struct ocfs2_extent_map_item {
unsigned int ei_cpos;
unsigned int ei_phys;
unsigned int ei_clusters;
unsigned int ei_flags;
struct list_head ei_list;
};
#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
struct ocfs2_extent_map {
unsigned int em_num_items;
struct list_head em_list;
};
void ocfs2_extent_map_init(struct inode *inode);
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
void ocfs2_extent_map_insert_rec(struct inode *inode,
struct ocfs2_extent_rec *rec);
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
u32 *num_clusters, unsigned int *extent_flags);
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags);
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
unsigned int *extent_flags);
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *el,
struct buffer_head *eb_bh,
u32 v_cluster,
u32 *num_clusters);
static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
{
int status = 0;
if (bh == NULL) {
printk("ocfs2: bh == NULL\n");
status = -EINVAL;
goto bail;
}
status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
bail:
return status;
}
#endif /* _EXTENT_MAP_H */

2701
fs/ocfs2/file.c Normal file

File diff suppressed because it is too large Load diff

76
fs/ocfs2/file.h Normal file
View file

@ -0,0 +1,76 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* file.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_FILE_H
#define OCFS2_FILE_H
extern const struct file_operations ocfs2_fops;
extern const struct file_operations ocfs2_dops;
extern const struct file_operations ocfs2_fops_no_plocks;
extern const struct file_operations ocfs2_dops_no_plocks;
extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
struct file *fp_file;
struct mutex fp_mutex;
struct ocfs2_lock_res fp_flock;
};
int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int ocfs2_permission(struct inode *inode, int mask);
int ocfs2_should_update_atime(struct inode *inode,
struct vfsmount *vfsmnt);
int ocfs2_update_inode_atime(struct inode *inode,
struct buffer_head *bh);
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr);
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
#endif /* OCFS2_FILE_H */

134
fs/ocfs2/heartbeat.c Normal file
View file

@ -0,0 +1,134 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* heartbeat.c
*
* Register ourselves with the heartbaet service, keep our node maps
* up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit);
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
static void ocfs2_node_map_init(struct ocfs2_node_map *map)
{
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
sizeof(unsigned long));
}
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}
void ocfs2_do_node_down(int node_num, void *data)
{
struct ocfs2_super *osb = data;
BUG_ON(osb->node_num == node_num);
trace_ocfs2_do_node_down(node_num);
if (!osb->cconn) {
/*
* No cluster connection means we're not even ready to
* participate yet. We check the slots after the cluster
* comes up, so we will notice the node death then. We
* can safely ignore it here.
*/
return;
}
ocfs2_recovery_thread(osb, node_num);
}
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit)
{
set_bit(bit, map->map);
}
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
if (bit==-1)
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_set_bit(map, bit);
spin_unlock(&osb->node_map_lock);
}
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit)
{
clear_bit(bit, map->map);
}
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
if (bit==-1)
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_clear_bit(map, bit);
spin_unlock(&osb->node_map_lock);
}
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
{
int ret;
if (bit >= map->num_nodes) {
mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
BUG();
}
spin_lock(&osb->node_map_lock);
ret = test_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
return ret;
}

45
fs/ocfs2/heartbeat.h Normal file
View file

@ -0,0 +1,45 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* heartbeat.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_HEARTBEAT_H
#define OCFS2_HEARTBEAT_H
void ocfs2_init_node_maps(struct ocfs2_super *osb);
void ocfs2_do_node_down(int node_num, void *data);
/* node map functions - used to keep track of mounted and in-recovery
* nodes. */
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
#endif /* OCFS2_HEARTBEAT_H */

1461
fs/ocfs2/inode.c Normal file

File diff suppressed because it is too large Load diff

186
fs/ocfs2/inode.h Normal file
View file

@ -0,0 +1,186 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* inode.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_INODE_H
#define OCFS2_INODE_H
#include "extent_map.h"
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
u64 ip_blkno;
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_inode_lockres;
struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
/* Number of outstanding AIO's which are not page aligned */
struct mutex ip_unaligned_aio;
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
struct list_head ip_io_markers;
u32 ip_clusters;
u16 ip_dyn_features;
struct mutex ip_io_mutex;
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
struct ocfs2_caching_info ip_metadata_cache;
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
u32 ip_dir_start_lookup;
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
u32 ip_dir_lock_gen;
struct ocfs2_alloc_reservation ip_la_data_resv;
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
};
/*
* Flags for the ip_flags field
*/
/* System file inodes */
#define OCFS2_INODE_SYSTEM_FILE 0x00000001
#define OCFS2_INODE_JOURNAL 0x00000002
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
* continuing.
*
* We *only* set this on unlink vote from another node. If the inode
* was locally orphaned, then we're sure of the state and don't need
* to twiddle i_nlink later - it's either zero or not depending on
* whether our unlink succeeded. Otherwise we got this from a node
* whose intention was to orphan the inode, however he may have
* crashed, failed etc, so we let ocfs2_drop_inode zero the value and
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
return container_of(inode, struct ocfs2_inode_info, vfs_inode);
}
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
{
return &OCFS2_I(inode)->ip_metadata_cache;
}
void ocfs2_evict_inode(struct inode *inode);
int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
void ocfs2_refresh_inode(struct inode *inode,
struct ocfs2_dinode *fe);
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
/* Validate that a bh contains a valid inode */
int ocfs2_validate_inode_block(struct super_block *sb,
struct buffer_head *bh);
/*
* Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
* This is a cached read. The inode will be validated with
* ocfs2_validate_inode_block().
*/
int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
/* The same, but can be passed OCFS2_BH_* flags */
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags);
static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
{
return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
}
#endif /* OCFS2_INODE_H */

1005
fs/ocfs2/ioctl.c Normal file

File diff suppressed because it is too large Load diff

16
fs/ocfs2/ioctl.h Normal file
View file

@ -0,0 +1,16 @@
/*
* ioctl.h
*
* Function prototypes
*
* Copyright (C) 2006 Herbert Poetzl
*
*/
#ifndef OCFS2_IOCTL_PROTO_H
#define OCFS2_IOCTL_PROTO_H
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
#endif /* OCFS2_IOCTL_PROTO_H */

2252
fs/ocfs2/journal.c Normal file

File diff suppressed because it is too large Load diff

640
fs/ocfs2/journal.h Normal file
View file

@ -0,0 +1,640 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* journal.h
*
* Defines journalling api and structures.
*
* Copyright (C) 2003, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_JOURNAL_H
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
#include <linux/jbd2.h>
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
OCFS2_JOURNAL_LOADED,
OCFS2_JOURNAL_IN_SHUTDOWN,
};
struct ocfs2_super;
struct ocfs2_dinode;
/*
* The recovery_list is a simple linked list of node numbers to recover.
* It is protected by the recovery_lock.
*/
struct ocfs2_recovery_map {
unsigned int rm_used;
unsigned int *rm_entries;
};
struct ocfs2_journal {
enum ocfs2_journal_state j_state; /* Journals current state */
journal_t *j_journal; /* The kernels journal type */
struct inode *j_inode; /* Kernel inode pointing to
* this journal */
struct ocfs2_super *j_osb; /* pointer to the super
* block for the node
* we're currently
* running on -- not
* necessarily the super
* block from the node
* which we usually run
* from (recovery,
* etc) */
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
/* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
extern spinlock_t trans_inc_lock;
/* wrap j_trans_id so we never have it equal to zero. */
static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
{
unsigned long old_id;
spin_lock(&trans_inc_lock);
old_id = j->j_trans_id++;
if (unlikely(!j->j_trans_id))
j->j_trans_id = 1;
spin_unlock(&trans_inc_lock);
return old_id;
}
static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
ci->ci_last_trans = journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Used to figure out whether it's safe to drop a metadata lock on an
* cached object. Returns true if all the object's changes have been
* checkpointed to disk. You should be holding the spinlock on the
* metadata lock while calling this to be sure that nobody can take
* the lock and put it on another transaction. */
static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
{
int ret;
struct ocfs2_journal *journal =
OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
spin_lock(&trans_inc_lock);
ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
/* convenience function to check if an object backed by struct
* ocfs2_caching_info is still new (has never hit disk) Will do you a
* favor and set created_trans = 0 when you've
* been checkpointed. returns '1' if the ci is still new. */
static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
{
int ret;
struct ocfs2_journal *journal =
OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
spin_lock(&trans_inc_lock);
ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
if (!ret)
ci->ci_created_trans = 0;
spin_unlock(&trans_inc_lock);
return ret;
}
/* Wrapper for inodes so we can check system files */
static inline int ocfs2_inode_is_new(struct inode *inode)
{
/* System files are never "new" as they're written out by
* mkfs. This helps us early during mount, before we have the
* journal open and j_trans_id could be junk. */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
return 0;
return ocfs2_ci_is_new(INODE_CACHE(inode));
}
static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
ci->ci_created_trans = osb->journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
*
* ocfs2_journal_init - Initialize journal structures in the OSB.
* ocfs2_journal_load - Load the given journal off disk. Replay it if
* there's transactions still in there.
* ocfs2_journal_shutdown - Shutdown a journal, this will flush all
* uncommitted, uncheckpointed transactions.
* ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
* zero out each block.
* ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
* ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
* event on.
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
int ocfs2_journal_init(struct ocfs2_journal *journal,
int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
int replayed);
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
wake_up(&osb->checkpoint_event);
}
static inline void ocfs2_checkpoint_inode(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (ocfs2_mount_local(osb))
return;
if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
* wind up waiting *a lot* longer than necessary. Right
* now we only use this in clear_inode so that's
* OK. */
ocfs2_start_checkpoint(osb);
wait_event(osb->journal->j_checkpointed,
ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
}
}
/*
* Transaction Handling:
* Manage the lifetime of a transaction handle.
*
* ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
* the number of blocks that will be changed during
* this handle.
* ocfs2_commit_trans - Complete a handle. It might return -EIO if
* the journal was aborted. The majority of paths don't
* check the return value as an error there comes too
* late to do anything (and will be picked up in a
* later transaction).
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
* commit the handle to disk in the process, but will
* not release any locks taken during the transaction.
* ocfs2_journal_access* - Notify the handle that we want to journal this
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* Always call the specific flavor of
* ocfs2_journal_access_*() unless you intend to
* manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
* ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
* the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
* perfectly legal to go through an entire transaction without having
* dirtied any buffers. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int max_buffs);
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
/*
* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* fallocate(2) we can write more than this, but we always
* start off at the maximum transaction size and grow the transaction
* optimistically as we go.
*/
#define OCFS2_MAX_TRANS_DATA 64U
/*
* Create access is for when we get a newly created buffer and we're
* not gonna read it off disk, but rather fill it ourselves. Right
* now, we don't do anything special with this (it turns into a write
* request), but this is a good placeholder in case we do...
*
* Write access is for when we read a block off disk and are going to
* modify it. This way the journalling layer knows it may need to make
* a copy of that block (if it's part of another, uncommitted
* transaction) before we do so.
*/
#define OCFS2_JOURNAL_ACCESS_CREATE 0
#define OCFS2_JOURNAL_ACCESS_WRITE 1
#define OCFS2_JOURNAL_ACCESS_UNDO 2
/* ocfs2_inode */
int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_extent_block */
int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_refcount_block */
int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_group_desc */
int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_xattr_block */
int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* quota blocks */
int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* dirblock */
int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_root_block */
int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_leaf */
int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* Anything that has no ecc */
int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/*
* A word about the journal_access/journal_dirty "dance". It is
* entirely legal to journal_access a buffer more than once (as long
* as the access type is the same -- I'm not sure what will happen if
* access type is different but this should never happen anyway) It is
* also legal to journal_dirty a buffer more than once. In fact, you
* can even journal_access a buffer after you've done a
* journal_access/journal_dirty pair. The only thing you cannot do
* however, is journal_dirty a buffer which you haven't yet passed to
* journal_access at least once.
*
* That said, 99% of the time this doesn't matter and this is what the
* path looks like:
*
* <read a bh>
* ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE);
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
/*
* Credit Macros:
* Convenience macros to calculate number of credits needed.
*
* For convenience sake, I have a set of macros here which calculate
* the *maximum* number of sectors which will be changed for various
* metadata updates.
*/
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
/* Update of a single quota block */
#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
/* global quotafile inode update, data block */
#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
/*
* The two writes below can accidentally see global info dirty due
* to set_info() quotactl so make them prepared for the writes.
*/
/* quota data block, global info */
/* Write to local quota file */
#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
/* global quota data block, local quota data block, global quota inode,
* global quota info */
#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
static inline int ocfs2_quota_trans_credits(struct super_block *sb)
{
int credits = 0;
if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
credits += OCFS2_QWRITE_CREDITS;
if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
credits += OCFS2_QWRITE_CREDITS;
return credits;
}
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* group add. inode update and the new group update. */
#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* get one bit out of a suballocator: dinode + group descriptor +
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
{
return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
ocfs2_quota_trans_credits(sb);
}
/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)
#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
static inline int ocfs2_remove_extent_credits(struct super_block *sb)
{
return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
ocfs2_quota_trans_credits(sb);
}
/* data block for new dir/symlink, allocation of directory block, dx_root
* update for free list */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
{
/* 1 block for index, 2 allocs (data, metadata), 1 clusters
* worth of blocks for initial extent. */
return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
ocfs2_clusters_to_blocks(sb, 1);
}
/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
* alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
* blocks + quota update */
static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
int xattr_credits)
{
int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
if (is_dir)
dir_credits += ocfs2_add_dir_index_credits(sb);
return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
ocfs2_quota_trans_credits(sb);
}
/* local alloc metadata change + main bitmap updates */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
+ OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
/* used when we don't need an allocation change for a dir extend. One
* for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
* update on dir + index leaf + dx root update for free list +
* previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
* dir inode link + dir inode index leaf + dir index root */
static inline int ocfs2_unlink_credits(struct super_block *sb)
{
/* The quota update from ocfs2_link_credits is unused here... */
return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
}
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
* inode alloc group descriptor + orphan dir index root +
* orphan dir index leaf */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink + 3 x dir index leaves */
static inline int ocfs2_rename_credits(struct super_block *sb)
{
return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
}
/* global bitmap dinode, group desc., relinked group,
* suballocator dinode, group desc., relinked group,
* dinode, xattr block */
#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+ OCFS2_INODE_UPDATE_CREDITS \
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
/* inode update, removal of dx root block from allocator */
#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
OCFS2_SUBALLOC_FREE)
static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
{
int credits = 1 + OCFS2_SUBALLOC_ALLOC;
credits += ocfs2_clusters_to_blocks(sb, 1);
credits += ocfs2_quota_trans_credits(sb);
return credits;
}
/* inode update, new refcount block and its allocation credits. */
#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+ OCFS2_SUBALLOC_ALLOC)
/* inode and the refcount block update. */
#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/*
* inode and the refcount block update.
* It doesn't include the credits for sub alloc change.
* So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
*/
#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* 2 metadata alloc, 2 new blocks and root refcount block */
#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
/*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
struct ocfs2_extent_list *root_el)
{
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
/* we might need to shift tree depth so lets assume an
* absolute worst case of complete fragmentation. Even with
* that, we only need one update for the dinode, and then
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
/* this does not include *new* metadata blocks, which are
* accounted for in sysfile_bitmap_blocks. root_el +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
{
int blocks = ocfs2_mknod_credits(sb, 0, 0);
/* links can be longer than one block so we may update many
* within our single allocated extent. */
blocks += ocfs2_clusters_to_blocks(sb, 1);
return blocks + ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
unsigned int cpg)
{
int blocks;
int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
/* parent inode update + new block group header + bitmap inode update
+ bitmap blocks affected */
blocks = 1 + 1 + 1 + bitmap_blocks;
return blocks;
}
/*
* Allocating a discontiguous block group requires the credits from
* ocfs2_calc_group_alloc_credits() as well as enough credits to fill
* the group descriptor's extent list. The caller already has started
* the transaction with ocfs2_calc_group_alloc_credits(). They extend
* it with these credits.
*/
static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
{
return ocfs2_extent_recs_per_gd(sb);
}
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
struct ocfs2_extent_list *last_el)
{
/* for dinode + all headers in this pass + update to next leaf */
u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
int credits = 1 + tree_depth + 1;
int i;
i = next_free - 1;
BUG_ON(i < 0);
/* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */
if (tree_depth && next_free == 1 &&
ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
credits += 1 + tree_depth;
/* update to the truncate log. */
credits += OCFS2_TRUNCATE_LOG_UPDATE;
credits += ocfs2_quota_trans_credits(sb);
return credits;
}
static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
return jbd2_journal_begin_ordered_truncate(
OCFS2_SB(inode->i_sb)->journal->j_journal,
&OCFS2_I(inode)->ip_jinode,
new_size);
}
static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
struct inode *inode,
int datasync)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
oi->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
oi->i_datasync_tid = handle->h_transaction->t_tid;
}
#endif /* OCFS2_JOURNAL_H */

1343
fs/ocfs2/localalloc.c Normal file

File diff suppressed because it is too large Load diff

68
fs/ocfs2/localalloc.h Normal file
View file

@ -0,0 +1,68 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* localalloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCALALLOC_H
#define OCFS2_LOCALALLOC_H
int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc);
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
u64 bits);
struct ocfs2_alloc_context;
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context *ac);
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
u32 *bit_off,
u32 *num_bits);
int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bit_off,
u32 num_bits);
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
#endif /* OCFS2_LOCALALLOC_H */

141
fs/ocfs2/locks.c Normal file
View file

@ -0,0 +1,141 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* locks.c
*
* Userspace file locking support
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/fcntl.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "locks.h"
static int ocfs2_do_flock(struct file *file, struct inode *inode,
int cmd, struct file_lock *fl)
{
int ret = 0, level = 0, trylock = 0;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
if (fl->fl_type == F_WRLCK)
level = 1;
if (!IS_SETLKW(cmd))
trylock = 1;
mutex_lock(&fp->fp_mutex);
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
lockres->l_level > LKM_NLMODE) {
int old_level = 0;
if (lockres->l_level == LKM_EXMODE)
old_level = 1;
if (level == old_level)
goto out;
/*
* Converting an existing lock is not guaranteed to be
* atomic, so we can get away with simply unlocking
* here and allowing the lock code to try at the new
* level.
*/
flock_lock_file_wait(file,
&(struct file_lock){.fl_type = F_UNLCK});
ocfs2_file_unlock(file);
}
ret = ocfs2_file_lock(file, level, trylock);
if (ret) {
if (ret == -EAGAIN && trylock)
ret = -EWOULDBLOCK;
else
mlog_errno(ret);
goto out;
}
ret = flock_lock_file_wait(file, fl);
if (ret)
ocfs2_file_unlock(file);
out:
mutex_unlock(&fp->fp_mutex);
return ret;
}
static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
{
int ret;
struct ocfs2_file_private *fp = file->private_data;
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
ret = flock_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
}
/*
* Overall flow of ocfs2_flock() was influenced by gfs2_flock().
*/
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
if (__mandatory_lock(inode))
return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
return flock_lock_file_wait(file, fl);
if (fl->fl_type == F_UNLCK)
return ocfs2_do_funlock(file, cmd, fl);
else
return ocfs2_do_flock(file, inode, cmd, fl);
}
int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
}

32
fs/ocfs2/locks.h Normal file
View file

@ -0,0 +1,32 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* locks.h
*
* Function prototypes for Userspace file locking support
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKS_H
#define OCFS2_LOCKS_H
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
#endif /* OCFS2_LOCKS_H */

194
fs/ocfs2/mmap.c Normal file
View file

@ -0,0 +1,194 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* mmap.c
*
* Code to deal with the mess that is clustered mmap.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "aops.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "mmap.h"
#include "super.h"
#include "ocfs2_trace.h"
static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
sigset_t oldset;
int ret;
ocfs2_block_signals(&oldset);
ret = filemap_fault(area, vmf);
ocfs2_unblock_signals(&oldset);
trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
area, vmf->page, vmf->pgoff);
return ret;
}
static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
int ret = VM_FAULT_NOPAGE;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
pgoff_t last_index;
struct page *locked_page = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
last_index = (size - 1) >> PAGE_CACHE_SHIFT;
/*
* There are cases that lead to the page no longer bebongs to the
* mapping.
* 1) pagecache truncates locally due to memory pressure.
* 2) pagecache truncates when another is taking EX lock against
* inode lock. see ocfs2_data_convert_worker.
*
* The i_size check doesn't catch the case where nodes truncated and
* then re-extended the file. We'll re-check the page mapping after
* taking the page lock inside of ocfs2_write_begin_nolock().
*
* Let VM retry with these cases.
*/
if ((page->mapping != inode->i_mapping) ||
(!PageUptodate(page)) ||
(page_offset(page) >= size))
goto out;
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
* advantage of the allocation code there. We pass a write
* length of the whole page (chopped to i_size) to make sure
* the whole thing is allocated.
*
* Since we know the page is up to date, we don't have to
* worry about ocfs2_write_begin() skipping some buffer reads
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_SIGBUS;
goto out;
}
if (!locked_page) {
ret = VM_FAULT_NOPAGE;
goto out;
}
ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
fsdata);
BUG_ON(ret != len);
ret = VM_FAULT_LOCKED;
out:
return ret;
}
static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
struct buffer_head *di_bh = NULL;
sigset_t oldset;
int ret;
sb_start_pagefault(inode->i_sb);
ocfs2_block_signals(&oldset);
/*
* The cluster locks taken will block a truncate from another
* node. Taking the data lock will also ensure that we don't
* attempt page truncation as part of a downconvert.
*/
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
/*
* The alloc sem should be enough to serialize with
* ocfs2_truncate_file() changing i_size as well as any thread
* modifying the inode btree.
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
brelse(di_bh);
ocfs2_inode_unlock(inode, 1);
out:
ocfs2_unblock_signals(&oldset);
sb_end_pagefault(inode->i_sb);
return ret;
}
static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
ret = ocfs2_inode_lock_atime(file_inode(file),
file->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ocfs2_inode_unlock(file_inode(file), lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
return 0;
}

6
fs/ocfs2/mmap.h Normal file
View file

@ -0,0 +1,6 @@
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
#endif /* OCFS2_MMAP_H */

1078
fs/ocfs2/move_extents.c Normal file

File diff suppressed because it is too large Load diff

22
fs/ocfs2/move_extents.h Normal file
View file

@ -0,0 +1,22 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* move_extents.h
*
* Copyright (C) 2011 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_MOVE_EXTENTS_H
#define OCFS2_MOVE_EXTENTS_H
int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
#endif /* OCFS2_MOVE_EXTENTS_H */

2691
fs/ocfs2/namei.c Normal file

File diff suppressed because it is too large Load diff

45
fs/ocfs2/namei.h Normal file
View file

@ -0,0 +1,45 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* namei.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
struct buffer_head *orphan_dir_bh);
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
#endif /* OCFS2_NAMEI_H */

109
fs/ocfs2/ocfs1_fs_compat.h Normal file
View file

@ -0,0 +1,109 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs1_fs_compat.h
*
* OCFS1 volume header definitions. OCFS2 creates valid but unmountable
* OCFS1 volume headers on the first two sectors of an OCFS2 volume.
* This allows an OCFS1 volume to see the partition and cleanly fail to
* mount it.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS1_FS_COMPAT_H
#define _OCFS1_FS_COMPAT_H
#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
#define OCFS1_MAX_MOUNT_POINT_LEN 128
#define OCFS1_MAX_VOL_ID_LENGTH 16
#define OCFS1_MAX_VOL_LABEL_LEN 64
#define OCFS1_MAX_CLUSTER_NAME_LEN 64
#define OCFS1_MAJOR_VERSION (2)
#define OCFS1_MINOR_VERSION (0)
#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
/*
* OCFS1 superblock. Lives at sector 0.
*/
struct ocfs1_vol_disk_hdr
{
/*00*/ __u32 minor_version;
__u32 major_version;
/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
/*108*/ __u64 serial_num;
/*110*/ __u64 device_size;
__u64 start_off;
/*120*/ __u64 bitmap_off;
__u64 publ_off;
/*130*/ __u64 vote_off;
__u64 root_bitmap_off;
/*140*/ __u64 data_start_off;
__u64 root_bitmap_size;
/*150*/ __u64 root_off;
__u64 root_size;
/*160*/ __u64 cluster_size;
__u64 num_nodes;
/*170*/ __u64 num_clusters;
__u64 dir_node_size;
/*180*/ __u64 file_node_size;
__u64 internal_off;
/*190*/ __u64 node_cfg_off;
__u64 node_cfg_size;
/*1A0*/ __u64 new_cfg_off;
__u32 prot_bits;
__s32 excl_mount;
/*1B0*/
};
struct ocfs1_disk_lock
{
/*00*/ __u32 curr_master;
__u8 file_lock;
__u8 compat_pad[3]; /* Not in original definition. Used to
make the already existing alignment
explicit */
__u64 last_write_time;
/*10*/ __u64 last_read_time;
__u32 writer_node_num;
__u32 reader_node_num;
/*20*/ __u64 oin_node_map;
__u64 dlock_seq_num;
/*30*/
};
/*
* OCFS1 volume label. Lives at sector 1.
*/
struct ocfs1_vol_label
{
/*00*/ struct ocfs1_disk_lock disk_lock;
/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
/*70*/ __u16 label_len;
/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
/*82*/ __u16 vol_id_len;
/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
/*A4*/ __u16 cluster_name_len;
/*A6*/
};
#endif /* _OCFS1_FS_COMPAT_H */

888
fs/ocfs2/ocfs2.h Normal file
View file

@ -0,0 +1,888 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2.h
*
* Defines macros and structures used in OCFS2
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_H
#define OCFS2_H
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/jbd2.h>
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
#include "ocfs2_fs.h"
#include "ocfs2_lockid.h"
#include "ocfs2_ioctl.h"
/* For struct ocfs2_blockcheck_stats */
#include "blockcheck.h"
#include "reservations.h"
/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
* amount of inlined blocks to be stored on an array and grow the
* structure into a rb tree when necessary. */
#define OCFS2_CACHE_INFO_MAX_ARRAY 2
/* Flags for ocfs2_caching_info */
enum ocfs2_caching_info_flags {
/* Indicates that the metadata cache is using the inline array */
OCFS2_CACHE_FL_INLINE = 1<<1,
};
struct ocfs2_caching_operations;
struct ocfs2_caching_info {
/*
* The parent structure provides the locks, but because the
* parent structure can differ, it provides locking operations
* to struct ocfs2_caching_info.
*/
const struct ocfs2_caching_operations *ci_ops;
/* next two are protected by trans_inc_lock */
/* which transaction were we created on? Zero if none. */
unsigned long ci_created_trans;
/* last transaction we were a part of. */
unsigned long ci_last_trans;
/* Cache structures */
unsigned int ci_flags;
unsigned int ci_num_cached;
union {
sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
struct rb_root ci_tree;
} ci_cache;
};
/*
* Need this prototype here instead of in uptodate.h because journal.h
* uses it.
*/
struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
#define OCFS2_NODE_MAP_MAX_NODES 256
struct ocfs2_node_map {
u16 num_nodes;
unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
};
enum ocfs2_ast_action {
OCFS2_AST_INVALID = 0,
OCFS2_AST_ATTACH,
OCFS2_AST_CONVERT,
OCFS2_AST_DOWNCONVERT,
};
/* actions for an unlockast function to take. */
enum ocfs2_unlock_action {
OCFS2_UNLOCK_INVALID = 0,
OCFS2_UNLOCK_CANCEL_CONVERT,
OCFS2_UNLOCK_DROP_LOCK,
};
/* ocfs2_lock_res->l_flags flags. */
#define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized
* the lvb */
#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
* downconvert*/
#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
#define OCFS2_LOCK_REFRESHING (0x00000020)
#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
* for shutdown paths */
#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
* when to skip queueing
* a lock because it's
* about to be
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
call to dlm_lock. Only
exists with BUSY set. */
#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
* from downconverting
* before the upconvert
* has completed */
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
#ifdef CONFIG_OCFS2_FS_STATS
struct ocfs2_lock_stats {
u64 ls_total; /* Total wait in NSEC */
u32 ls_gets; /* Num acquires */
u32 ls_fail; /* Num failed acquires */
/* Storing max wait in usecs saves 24 bytes per inode */
u32 ls_max; /* Max wait in USEC */
};
#endif
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
unsigned int l_ro_holders;
unsigned int l_ex_holders;
signed char l_level;
signed char l_requested;
signed char l_blocking;
/* Data packed - type enum ocfs2_lock_type */
unsigned char l_type;
/* used from AST/BAST funcs. */
/* Data packed - enum type ocfs2_ast_action */
unsigned char l_action;
/* Data packed - enum type ocfs2_unlock_action */
unsigned char l_unlock_action;
unsigned int l_pending_gen;
spinlock_t l_lock;
struct ocfs2_dlm_lksb l_lksb;
wait_queue_head_t l_event;
struct list_head l_debug_list;
#ifdef CONFIG_OCFS2_FS_STATS
struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
u32 l_lock_refresh; /* Disk refreshes */
struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map l_lockdep_map;
#endif
};
enum ocfs2_orphan_scan_state {
ORPHAN_SCAN_ACTIVE,
ORPHAN_SCAN_INACTIVE
};
struct ocfs2_orphan_scan {
struct mutex os_lock;
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
struct timespec os_scantime; /* time this node ran the scan */
u32 os_count; /* tracks node specific scans */
u32 os_seqno; /* tracks cluster wide scans */
atomic_t os_state; /* ACTIVE or INACTIVE */
};
struct ocfs2_dlm_debug {
struct kref d_refcnt;
struct dentry *d_locking_state;
struct list_head d_lockres_tracking;
};
enum ocfs2_vol_state
{
VOLUME_INIT = 0,
VOLUME_MOUNTED,
VOLUME_MOUNTED_QUOTAS,
VOLUME_DISMOUNTED,
VOLUME_DISABLED
};
struct ocfs2_alloc_stats
{
atomic_t moves;
atomic_t local_data;
atomic_t bitmap_data;
atomic_t bg_allocs;
atomic_t bg_extends;
};
enum ocfs2_local_alloc_state
{
OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
* this mountpoint. */
OCFS2_LA_ENABLED, /* Local alloc is in use. */
OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
* of bits has been reduced. */
OCFS2_LA_DISABLED /* Local alloc has temporarily been
* disabled. */
};
enum ocfs2_mount_options
{
OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
control lists */
OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
writes */
OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
#define OCFS2_OSB_SOFT_RO 0x0001
#define OCFS2_OSB_HARD_RO 0x0002
#define OCFS2_OSB_ERROR_FS 0x0004
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
struct ocfs2_replay_map;
struct ocfs2_quota_recovery;
struct ocfs2_super
{
struct task_struct *commit_task;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
u32 *slot_recovery_generations;
spinlock_t node_map_lock;
u64 root_blkno;
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
u32 uuid_hash;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
u32 s_feature_compat;
u32 s_feature_incompat;
u32 s_feature_ro_compat;
/* Protects s_next_generation, osb_flags and s_inode_steal_slot.
* Could protect more on osb as it's very short lived.
*/
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
s16 s_inode_steal_slot;
s16 s_meta_steal_slot;
atomic_t s_num_inodes_stolen;
atomic_t s_num_meta_stolen;
unsigned long s_mount_opt;
unsigned int s_atime_quantum;
unsigned int max_slots;
unsigned int node_num;
int slot_num;
int preferred_slot;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
unsigned int s_xattr_inline_size;
atomic_t vol_state;
struct mutex recovery_lock;
struct ocfs2_recovery_map *recovery_map;
struct ocfs2_replay_map *replay_map;
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
struct delayed_work la_enable_wq;
/*
* Must hold local alloc i_mutex and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
/* osb_clusters_at_boot can become stale! Do not trust it to
* be up to date. */
unsigned int osb_clusters_at_boot;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
struct buffer_head *local_alloc_bh;
u64 la_last_gd;
struct ocfs2_reservation_map osb_la_resmap;
unsigned int osb_resv_level;
unsigned int osb_dir_resv_level;
/* Next three fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_quota_recovery *quota_rec;
struct ocfs2_blockcheck_stats osb_ecc_stats;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
u8 osb_stackflags;
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
struct dentry *osb_ctxt;
wait_queue_head_t recovery_event;
spinlock_t dc_task_lock;
struct task_struct *dc_task;
wait_queue_head_t dc_event;
unsigned long dc_wake_sequence;
unsigned long dc_work_sequence;
/*
* Any thread can add locks to the list, but the downconvert
* thread is the only one allowed to remove locks. Any change
* to this rule requires updating
* ocfs2_downconvert_thread_do_work().
*/
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
/* List of dquot structures to drop last reference to */
struct llist_head dquot_drop_list;
struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
/* Truncate log info */
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
* It must be protected by osb_tl_inode->i_mutex.
*/
unsigned int truncated_clusters;
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
struct ocfs2_orphan_scan osb_orphan_scan;
/* used to protect metaecc calculation check of xattr. */
spinlock_t osb_xattr_lock;
unsigned int osb_dx_mask;
u32 osb_dx_seed[4];
/* the group we used to allocate inodes. */
u64 osb_inode_alloc_group;
/* rb tree root for refcount lock. */
struct rb_root osb_rf_lock_tree;
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
/* Useful typedef for passing around journal access functions */
typedef int (*ocfs2_journal_access_func)(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
static inline int ocfs2_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
return 0;
return 1;
}
static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
return 1;
return 0;
}
static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
{
/*
* Support for sparse files is a pre-requisite
*/
if (!ocfs2_sparse_alloc(osb))
return 0;
if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
return 1;
return 0;
}
static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
return 1;
return 0;
}
static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
return 1;
return 0;
}
static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
return 1;
return 0;
}
static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
return 1;
return 0;
}
static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
return 1;
return 0;
}
static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
{
if (ocfs2_supports_indexed_dirs(osb))
return OCFS2_DX_LINK_MAX;
return OCFS2_LINK_MAX;
}
static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
{
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
{
u16 lo, hi;
lo = nlink;
hi = nlink >> OCFS2_LINKS_HI_SHIFT;
di->i_links_count = cpu_to_le16(lo);
di->i_links_count_hi = cpu_to_le16(hi);
}
static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
{
u32 links = ocfs2_read_links_count(di);
links += n;
ocfs2_set_links_count(di, links);
}
static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
return 1;
return 0;
}
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
* too! */
static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
unsigned long flag)
{
spin_lock(&osb->osb_lock);
osb->osb_flags |= flag;
spin_unlock(&osb->osb_lock);
}
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
spin_lock(&osb->osb_lock);
osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
if (hard)
osb->osb_flags |= OCFS2_OSB_HARD_RO;
else
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
}
static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
{
int ret;
spin_lock(&osb->osb_lock);
ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
spin_unlock(&osb->osb_lock);
return ret;
}
static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
{
int ret;
spin_lock(&osb->osb_lock);
ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
return ret;
}
static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
(OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
}
static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
{
if (ocfs2_clusterinfo_valid(osb) &&
memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
OCFS2_STACK_LABEL_LEN))
return 1;
return 0;
}
static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
{
if (ocfs2_clusterinfo_valid(osb) &&
!memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
OCFS2_STACK_LABEL_LEN))
return 1;
return 0;
}
static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
{
return ocfs2_o2cb_stack(osb) &&
(osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
}
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
}
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
#define OCFS2_IS_VALID_DX_ROOT(ptr) \
(!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
#define OCFS2_IS_VALID_DX_LEAF(ptr) \
(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
(!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
return (unsigned long)(blkno & (u64)ULONG_MAX);
}
static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
u32 clusters)
{
int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
sb->s_blocksize_bits;
return (u64)clusters << c_to_b_bits;
}
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
u64 blocks)
{
int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
sb->s_blocksize_bits;
return (u32)(blocks >> b_to_c_bits);
}
static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
u64 bytes)
{
int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int clusters;
bytes += OCFS2_SB(sb)->s_clustersize - 1;
/* OCFS2 just cannot have enough clusters to overflow this */
clusters = (unsigned int)(bytes >> cl_bits);
return clusters;
}
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
u64 bytes)
{
bytes += sb->s_blocksize - 1;
return bytes >> sb->s_blocksize_bits;
}
static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
u32 clusters)
{
return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
}
static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
u64 blocks)
{
int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
unsigned int clusters;
clusters = ocfs2_blocks_to_clusters(sb, blocks);
return (u64)clusters << bits;
}
static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
u64 bytes)
{
int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int clusters;
clusters = ocfs2_clusters_for_bytes(sb, bytes);
return (u64)clusters << cl_bits;
}
static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
u64 bytes)
{
u64 blocks;
blocks = ocfs2_blocks_for_bytes(sb, bytes);
return blocks << sb->s_blocksize_bits;
}
static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
{
return (unsigned long)((bytes + 511) >> 9);
}
static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
unsigned long pg_index)
{
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
if (unlikely(PAGE_CACHE_SHIFT > cbits))
clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
else if (PAGE_CACHE_SHIFT < cbits)
clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
return clusters;
}
/*
* Find the 1st page index which covers the given clusters.
*/
static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
u32 clusters)
{
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
pgoff_t index = clusters;
if (PAGE_CACHE_SHIFT > cbits) {
index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
} else if (PAGE_CACHE_SHIFT < cbits) {
index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
}
return index;
}
static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
{
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int pages_per_cluster = 1;
if (PAGE_CACHE_SHIFT < cbits)
pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
return pages_per_cluster;
}
static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
unsigned int megs)
{
BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
unsigned int clusters)
{
return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
__set_bit_le(bit, bitmap);
}
#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
{
__clear_bit_le(bit, bitmap);
}
#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
#define ocfs2_test_bit test_bit_le
#define ocfs2_find_next_zero_bit find_next_zero_bit_le
#define ocfs2_find_next_bit find_next_bit_le
static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
*bit += ((unsigned long) addr & 7UL) << 3;
addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
*bit += ((unsigned long) addr & 3UL) << 3;
addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
return addr;
}
static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
{
bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
ocfs2_set_bit(bit, bitmap);
}
static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
{
bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
ocfs2_clear_bit(bit, bitmap);
}
static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
{
bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
return ocfs2_test_bit(bit, bitmap);
}
static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
int start)
{
int fix = 0, ret, tmpmax;
bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
tmpmax = max + fix;
start += fix;
ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
if (ret > max)
return max;
return ret;
}
#endif /* OCFS2_H */

1640
fs/ocfs2/ocfs2_fs.h Normal file

File diff suppressed because it is too large Load diff

242
fs/ocfs2/ocfs2_ioctl.h Normal file
View file

@ -0,0 +1,242 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_ioctl.h
*
* Defines OCFS2 ioctls.
*
* Copyright (C) 2010 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_IOCTL_H
#define OCFS2_IOCTL_H
/*
* ioctl commands
*/
#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
/*
* Space reservation / allocation / free ioctls and argument structure
* are designed to be compatible with XFS.
*
* ALLOCSP* and FREESP* are not and will never be supported, but are
* included here for completeness.
*/
struct ocfs2_space_resv {
__s16 l_type;
__s16 l_whence;
__s64 l_start;
__s64 l_len; /* len == 0 means until end of file */
__s32 l_sysid;
__u32 l_pid;
__s32 l_pad[4]; /* reserve area */
};
#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
/* Used to pass group descriptor data when online resize is done */
struct ocfs2_new_group_input {
__u64 group; /* Group descriptor's blkno. */
__u32 clusters; /* Total number of clusters in this group */
__u32 frees; /* Total free clusters in this group */
__u16 chain; /* Chain for this group */
__u16 reserved1;
__u32 reserved2;
};
#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
/* Used to pass 2 file names to reflink. */
struct reflink_arguments {
__u64 old_path;
__u64 new_path;
__u64 preserve;
};
#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
/* Following definitions dedicated for ocfs2_info_request ioctls. */
#define OCFS2_INFO_MAX_REQUEST (50)
#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
/* Magic number of all requests */
#define OCFS2_INFO_MAGIC (0x4F32494E)
/*
* Always try to separate info request into small pieces to
* guarantee the backward&forward compatibility.
*/
struct ocfs2_info {
__u64 oi_requests; /* Array of __u64 pointers to requests */
__u32 oi_count; /* Number of requests in info_requests */
__u32 oi_pad;
};
struct ocfs2_info_request {
/*00*/ __u32 ir_magic; /* Magic number */
__u32 ir_code; /* Info request code */
__u32 ir_size; /* Size of request */
__u32 ir_flags; /* Request flags */
/*10*/ /* Request specific fields */
};
struct ocfs2_info_clustersize {
struct ocfs2_info_request ic_req;
__u32 ic_clustersize;
__u32 ic_pad;
};
struct ocfs2_info_blocksize {
struct ocfs2_info_request ib_req;
__u32 ib_blocksize;
__u32 ib_pad;
};
struct ocfs2_info_maxslots {
struct ocfs2_info_request im_req;
__u32 im_max_slots;
__u32 im_pad;
};
struct ocfs2_info_label {
struct ocfs2_info_request il_req;
__u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
} __attribute__ ((packed));
struct ocfs2_info_uuid {
struct ocfs2_info_request iu_req;
__u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
} __attribute__ ((packed));
struct ocfs2_info_fs_features {
struct ocfs2_info_request if_req;
__u32 if_compat_features;
__u32 if_incompat_features;
__u32 if_ro_compat_features;
__u32 if_pad;
};
struct ocfs2_info_journal_size {
struct ocfs2_info_request ij_req;
__u64 ij_journal_size;
};
struct ocfs2_info_freeinode {
struct ocfs2_info_request ifi_req;
struct ocfs2_info_local_freeinode {
__u64 lfi_total;
__u64 lfi_free;
} ifi_stat[OCFS2_MAX_SLOTS];
__u32 ifi_slotnum; /* out */
__u32 ifi_pad;
};
#define OCFS2_INFO_MAX_HIST (32)
struct ocfs2_info_freefrag {
struct ocfs2_info_request iff_req;
struct ocfs2_info_freefrag_stats { /* (out) */
struct ocfs2_info_free_chunk_list {
__u32 fc_chunks[OCFS2_INFO_MAX_HIST];
__u32 fc_clusters[OCFS2_INFO_MAX_HIST];
} ffs_fc_hist;
__u32 ffs_clusters;
__u32 ffs_free_clusters;
__u32 ffs_free_chunks;
__u32 ffs_free_chunks_real;
__u32 ffs_min; /* Minimum free chunksize in clusters */
__u32 ffs_max;
__u32 ffs_avg;
__u32 ffs_pad;
} iff_ffs;
__u32 iff_chunksize; /* chunksize in clusters(in) */
__u32 iff_pad;
};
/* Codes for ocfs2_info_request */
enum ocfs2_info_type {
OCFS2_INFO_CLUSTERSIZE = 1,
OCFS2_INFO_BLOCKSIZE,
OCFS2_INFO_MAXSLOTS,
OCFS2_INFO_LABEL,
OCFS2_INFO_UUID,
OCFS2_INFO_FS_FEATURES,
OCFS2_INFO_JOURNAL_SIZE,
OCFS2_INFO_FREEINODE,
OCFS2_INFO_FREEFRAG,
OCFS2_INFO_NUM_TYPES
};
/* Flags for struct ocfs2_info_request */
/* Filled by the caller */
#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
required. This is a hint.
It is up to ocfs2 whether
the request can be fulfilled
without locking. */
/* Filled by ocfs2 */
#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
this request and
filled in the answer */
#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
request handling. */
#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
struct ocfs2_move_extents {
/* All values are in bytes */
/* in */
__u64 me_start; /* Virtual start in the file to move */
__u64 me_len; /* Length of the extents to be moved */
__u64 me_goal; /* Physical offset of the goal,
it's in block unit */
__u64 me_threshold; /* Maximum distance from goal or threshold
for auto defragmentation */
__u64 me_flags; /* Flags for the operation:
* - auto defragmentation.
* - refcount,xattr cases.
*/
/* out */
__u64 me_moved_len; /* Moved/defraged length */
__u64 me_new_offset; /* Resulting physical location */
__u32 me_reserved[2]; /* Reserved for futhure */
};
#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
claim new clusters
as the goal place
for extents moving */
#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
moving, is to make
movement less likely
to fail, may make fs
even more fragmented */
#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
completely gets done.
*/
#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
#endif /* OCFS2_IOCTL_H */

128
fs/ocfs2/ocfs2_lockid.h Normal file
View file

@ -0,0 +1,128 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_lockid.h
*
* Defines OCFS2 lockid bits.
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKID_H
#define OCFS2_LOCKID_H
/* lock ids are made up in the following manner:
* name[0] --> type
* name[1-6] --> 6 pad characters, reserved for now
* name[7-22] --> block number, expressed in hex as 16 chars
* name[23-30] --> i_generation, expressed in hex 8 chars
* name[31] --> '\0' */
#define OCFS2_LOCK_ID_MAX_LEN 32
#define OCFS2_LOCK_ID_PAD "000000"
#define OCFS2_DENTRY_LOCK_INO_START 18
enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_META = 0,
OCFS2_LOCK_TYPE_DATA,
OCFS2_LOCK_TYPE_SUPER,
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_LOCK_TYPE_REFCOUNT,
OCFS2_NUM_LOCK_TYPES
};
static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
{
char c;
switch (type) {
case OCFS2_LOCK_TYPE_META:
c = 'M';
break;
case OCFS2_LOCK_TYPE_DATA:
c = 'D';
break;
case OCFS2_LOCK_TYPE_SUPER:
c = 'S';
break;
case OCFS2_LOCK_TYPE_RENAME:
c = 'R';
break;
case OCFS2_LOCK_TYPE_RW:
c = 'W';
break;
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
case OCFS2_LOCK_TYPE_OPEN:
c = 'O';
break;
case OCFS2_LOCK_TYPE_FLOCK:
c = 'F';
break;
case OCFS2_LOCK_TYPE_QINFO:
c = 'Q';
break;
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
c = 'P';
break;
case OCFS2_LOCK_TYPE_REFCOUNT:
c = 'T';
break;
default:
c = '\0';
}
return c;
}
static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_META] = "Meta",
[OCFS2_LOCK_TYPE_DATA] = "Data",
[OCFS2_LOCK_TYPE_SUPER] = "Super",
[OCFS2_LOCK_TYPE_RENAME] = "Rename",
/* Need to differntiate from [R]ename.. serializing writes is the
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
{
#ifdef __KERNEL__
BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
#endif
return ocfs2_lock_type_strings[type];
}
#endif /* OCFS2_LOCKID_H */

View file

@ -0,0 +1,32 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_lockingver.h
*
* Defines OCFS2 Locking version values.
*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License, version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_LOCKINGVER_H
#define OCFS2_LOCKINGVER_H
/*
* The protocol version for ocfs2 cluster locking. See dlmglue.c for
* more details.
*
* 1.0 - Initial locking version from ocfs2 1.4.
*/
#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
#define OCFS2_LOCKING_PROTOCOL_MINOR 0
#endif /* OCFS2_LOCKINGVER_H */

2768
fs/ocfs2/ocfs2_trace.h Normal file

File diff suppressed because it is too large Load diff

122
fs/ocfs2/quota.h Normal file
View file

@ -0,0 +1,122 @@
/*
* quota.h for OCFS2
*
* On disk quota structures for local and global quota file, in-memory
* structures.
*
*/
#ifndef _OCFS2_QUOTA_H
#define _OCFS2_QUOTA_H
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/list.h>
#include <linux/dqblk_qtree.h>
#include "ocfs2.h"
/* Number of quota types we support */
#define OCFS2_MAXQUOTAS 2
/*
* In-memory structures
*/
struct ocfs2_dquot {
struct dquot dq_dquot; /* Generic VFS dquot */
loff_t dq_local_off; /* Offset in the local quota file */
u64 dq_local_phys_blk; /* Physical block carrying quota structure */
struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace; /* Last globally synced space usage */
s64 dq_originodes; /* Last globally synced inode usage */
struct llist_node list; /* Member of list of dquots to drop */
};
/* Description of one chunk to recover in memory */
struct ocfs2_recovery_chunk {
struct list_head rc_list; /* List of chunks */
int rc_chunk; /* Chunk number */
unsigned long *rc_bitmap; /* Bitmap of entries to recover */
};
struct ocfs2_quota_recovery {
struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
};
/* In-memory structure with quota header information */
struct ocfs2_mem_dqinfo {
unsigned int dqi_type; /* Quota type this structure describes */
unsigned int dqi_chunks; /* Number of chunks in local quota file */
unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
unsigned int dqi_syncms; /* How often should we sync with other nodes */
struct list_head dqi_chunk; /* List of chunks */
struct inode *dqi_gqinode; /* Global quota file inode */
struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
u64 dqi_giblk; /* Number of block with global information header */
struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
struct buffer_head *dqi_libh; /* Buffer with local information header */
struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
struct delayed_work dqi_sync_work; /* Work for syncing dquots */
struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
* information, in case we
* enable quotas on file
* needing it */
};
static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
{
return container_of(dquot, struct ocfs2_dquot, dq_dquot);
}
struct ocfs2_quota_chunk {
struct list_head qc_chunk; /* List of quotafile chunks */
int qc_num; /* Number of quota chunk */
struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
};
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
extern struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct ocfs2_quota_recovery *rec,
int slot_num);
void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
ssize_t ocfs2_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
int ocfs2_global_read_info(struct super_block *sb, int type);
int ocfs2_global_write_info(struct super_block *sb, int type);
int ocfs2_global_read_dquot(struct dquot *dquot);
int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
static inline int ocfs2_sync_dquot(struct dquot *dquot)
{
return __ocfs2_sync_dquot(dquot, 0);
}
static inline int ocfs2_global_release_dquot(struct dquot *dquot)
{
return __ocfs2_sync_dquot(dquot, 1);
}
int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
struct buffer_head **bh);
int ocfs2_create_local_dquot(struct dquot *dquot);
int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
int ocfs2_local_write_dquot(struct dquot *dquot);
void ocfs2_drop_dquot_refs(struct work_struct *work);
extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
#endif /* _OCFS2_QUOTA_H */

971
fs/ocfs2/quota_global.c Normal file
View file

@ -0,0 +1,971 @@
/*
* Implementation of operations over global quota file
*/
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/dqblk_qtree.h>
#include <linux/jiffies.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/llist.h>
#include <cluster/masklog.h>
#include "ocfs2_fs.h"
#include "ocfs2.h"
#include "alloc.h"
#include "blockcheck.h"
#include "inode.h"
#include "journal.h"
#include "file.h"
#include "sysfile.h"
#include "dlmglue.h"
#include "uptodate.h"
#include "super.h"
#include "buffer_head_io.h"
#include "quota.h"
#include "ocfs2_trace.h"
/*
* Locking of quotas with OCFS2 is rather complex. Here are rules that
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
* by dqio_mutex or dquot->dq_lock.
* - any modification of global quota file holds inode cluster lock, i_mutex,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
* - an allocation of new blocks for local quota file is protected by
* its ip_alloc_sem
*
* A rough sketch of locking dependencies (lf = local file, gf = global file):
* Normal filesystem operation:
* start_trans -> dqio_mutex -> write to lf
* Syncing of local and global file:
* ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
* write to gf
* -> write to lf
* Acquire dquot for the first time:
* dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
* -> alloc space for gf
* -> start_trans -> qinfo_lock -> write to gf
* -> ip_alloc_sem of lf -> alloc space for lf
* -> write to lf
* Release last reference to dquot:
* dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
* -> write to lf
* Note that all the above operations also hold the inode cluster lock of lf.
* Recovery:
* inode cluster lock of recovered lf
* -> read bitmaps -> ip_alloc_sem of lf
* -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
* write to gf
*/
static void qsync_work_fn(struct work_struct *work);
static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
{
struct ocfs2_global_disk_dqblk *d = dp;
struct mem_dqblk *m = &dquot->dq_dqb;
/* Update from disk only entries not set by the admin */
if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
}
if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
}
if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
m->dqb_btime = le64_to_cpu(d->dqb_btime);
if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
m->dqb_itime = le64_to_cpu(d->dqb_itime);
OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
}
static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
{
struct ocfs2_global_disk_dqblk *d = dp;
struct mem_dqblk *m = &dquot->dq_dqb;
d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
d->dqb_itime = cpu_to_le64(m->dqb_itime);
d->dqb_pad1 = d->dqb_pad2 = 0;
}
static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
{
struct ocfs2_global_disk_dqblk *d = dp;
struct ocfs2_mem_dqinfo *oinfo =
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
if (qtree_entry_unused(&oinfo->dqi_gi, dp))
return 0;
return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
le32_to_cpu(d->dqb_id)),
dquot->dq_id);
}
struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
};
int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
{
struct ocfs2_disk_dqtrailer *dqt =
ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
/*
* If the ecc fails, we return the error but otherwise
* leave the filesystem running. We know any error is
* local to this block.
*/
return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
}
int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
struct buffer_head **bhp)
{
int rc;
*bhp = NULL;
rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
ocfs2_validate_quota_block);
if (rc)
mlog_errno(rc);
return rc;
}
/* Read data from global quotafile - avoid pagecache and such because we cannot
* afford acquiring the locks... We use quota cluster lock to serialize
* operations. Caller is responsible for acquiring it. */
ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
{
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
struct inode *gqinode = oinfo->dqi_gqinode;
loff_t i_size = i_size_read(gqinode);
int offset = off & (sb->s_blocksize - 1);
sector_t blk = off >> sb->s_blocksize_bits;
int err = 0;
struct buffer_head *bh;
size_t toread, tocopy;
u64 pblock = 0, pcount = 0;
if (off > i_size)
return 0;
if (off + len > i_size)
len = i_size - off;
toread = len;
while (toread > 0) {
tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
if (!pcount) {
err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
&pcount, NULL);
if (err) {
mlog_errno(err);
return err;
}
} else {
pcount--;
pblock++;
}
bh = NULL;
err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
if (err) {
mlog_errno(err);
return err;
}
memcpy(data, bh->b_data + offset, tocopy);
brelse(bh);
offset = 0;
toread -= tocopy;
data += tocopy;
blk++;
}
return len;
}
/* Write to quotafile (we know the transaction is already started and has
* enough credits) */
ssize_t ocfs2_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off)
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
struct inode *gqinode = oinfo->dqi_gqinode;
int offset = off & (sb->s_blocksize - 1);
sector_t blk = off >> sb->s_blocksize_bits;
int err = 0, new = 0, ja_type;
struct buffer_head *bh = NULL;
handle_t *handle = journal_current_handle();
u64 pblock, pcount;
if (!handle) {
mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
"because transaction was not started.\n",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
WARN_ON(1);
len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
}
if (i_size_read(gqinode) < off + len) {
loff_t rounded_end =
ocfs2_align_bytes_to_blocks(sb, off + len);
/* Space is already allocated in ocfs2_acquire_dquot() */
err = ocfs2_simple_size_update(gqinode,
oinfo->dqi_gqi_bh,
rounded_end);
if (err < 0)
goto out;
new = 1;
}
err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
if (err) {
mlog_errno(err);
goto out;
}
/* Not rewriting whole block? */
if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
!new) {
err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
} else {
bh = sb_getblk(sb, pblock);
if (!bh)
err = -ENOMEM;
ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
}
if (err) {
mlog_errno(err);
goto out;
}
lock_buffer(bh);
if (new)
memset(bh->b_data, 0, sb->s_blocksize);
memcpy(bh->b_data + offset, data, len);
flush_dcache_page(bh->b_page);
set_buffer_uptodate(bh);
unlock_buffer(bh);
ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
ja_type);
if (err < 0) {
brelse(bh);
goto out;
}
ocfs2_journal_dirty(handle, bh);
brelse(bh);
out:
if (err) {
mlog_errno(err);
return err;
}
gqinode->i_version++;
ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
return len;
}
int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
int status;
struct buffer_head *bh = NULL;
status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
if (status < 0)
return status;
spin_lock(&dq_data_lock);
if (!oinfo->dqi_gqi_count++)
oinfo->dqi_gqi_bh = bh;
else
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
if (ex) {
mutex_lock(&oinfo->dqi_gqinode->i_mutex);
down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
} else {
down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
}
return 0;
}
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
if (ex) {
up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
} else {
up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
}
ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
brelse(oinfo->dqi_gqi_bh);
spin_lock(&dq_data_lock);
if (!--oinfo->dqi_gqi_count)
oinfo->dqi_gqi_bh = NULL;
spin_unlock(&dq_data_lock);
}
/* Read information header from global quota file */
int ocfs2_global_read_info(struct super_block *sb, int type)
{
struct inode *gqinode = NULL;
unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
u64 pcount;
int status;
/* Read global header */
gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
if (!gqinode) {
mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
type);
status = -EINVAL;
goto out_err;
}
oinfo->dqi_gi.dqi_sb = sb;
oinfo->dqi_gi.dqi_type = type;
ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
oinfo->dqi_gqi_bh = NULL;
oinfo->dqi_gqi_count = 0;
oinfo->dqi_gqinode = gqinode;
status = ocfs2_lock_global_qf(oinfo, 0);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
if (status < 0)
goto out_unlock;
status = ocfs2_qinfo_lock(oinfo, 0);
if (status < 0)
goto out_unlock;
status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
ocfs2_qinfo_unlock(oinfo, 0);
ocfs2_unlock_global_qf(oinfo, 0);
if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
status);
if (status >= 0)
status = -EIO;
mlog_errno(status);
goto out_err;
}
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
OCFS2_QBLK_RESERVED_SPACE;
oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
out_err:
return status;
out_unlock:
ocfs2_unlock_global_qf(oinfo, 0);
mlog_errno(status);
goto out_err;
}
/* Write information to global quota file. Expects exlusive lock on quota
* file inode and quota info */
static int __ocfs2_global_write_info(struct super_block *sb, int type)
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
struct ocfs2_global_disk_dqinfo dinfo;
ssize_t size;
spin_lock(&dq_data_lock);
info->dqi_flags &= ~DQF_INFO_DIRTY;
dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
spin_unlock(&dq_data_lock);
dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
mlog(ML_ERROR, "Cannot write global quota info structure\n");
if (size >= 0)
size = -EIO;
return size;
}
return 0;
}
int ocfs2_global_write_info(struct super_block *sb, int type)
{
int err;
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
err = ocfs2_qinfo_lock(info, 1);
if (err < 0)
return err;
err = __ocfs2_global_write_info(sb, type);
ocfs2_qinfo_unlock(info, 1);
return err;
}
static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
{
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
/*
* We may need to allocate tree blocks and a leaf block but not the
* root block
*/
return oinfo->dqi_gi.dqi_qtree_depth;
}
static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
{
/* We modify all the allocated blocks, tree root, info block and
* the inode */
return (ocfs2_global_qinit_alloc(sb, type) + 2) *
OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
}
/* Sync local information about quota modifications with global quota file.
* Caller must have started the transaction and obtained exclusive lock for
* global quota file inode */
int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
{
int err, err2;
struct super_block *sb = dquot->dq_sb;
int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
time_t olditime, oldbtime;
err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct ocfs2_global_disk_dqblk),
dquot->dq_off);
if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
if (err >= 0) {
mlog(ML_ERROR, "Short read from global quota file "
"(%u read)\n", err);
err = -EIO;
}
goto out;
}
/* Update space and inode usage. Get also other information from
* global quota file so that we don't overwrite any changes there.
* We are */
spin_lock(&dq_data_lock);
spacechange = dquot->dq_dqb.dqb_curspace -
OCFS2_DQUOT(dquot)->dq_origspace;
inodechange = dquot->dq_dqb.dqb_curinodes -
OCFS2_DQUOT(dquot)->dq_originodes;
olditime = dquot->dq_dqb.dqb_itime;
oldbtime = dquot->dq_dqb.dqb_btime;
ocfs2_global_disk2memdqb(dquot, &dqblk);
trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_dqb.dqb_curspace,
(long long)spacechange,
dquot->dq_dqb.dqb_curinodes,
(long long)inodechange);
if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
dquot->dq_dqb.dqb_curspace += spacechange;
if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
dquot->dq_dqb.dqb_curinodes += inodechange;
/* Set properly space grace time... */
if (dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
oldbtime > 0) {
if (dquot->dq_dqb.dqb_btime > 0)
dquot->dq_dqb.dqb_btime =
min(dquot->dq_dqb.dqb_btime, oldbtime);
else
dquot->dq_dqb.dqb_btime = oldbtime;
}
} else {
dquot->dq_dqb.dqb_btime = 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
/* Set properly inode grace time... */
if (dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
olditime > 0) {
if (dquot->dq_dqb.dqb_itime > 0)
dquot->dq_dqb.dqb_itime =
min(dquot->dq_dqb.dqb_itime, olditime);
else
dquot->dq_dqb.dqb_itime = olditime;
}
} else {
dquot->dq_dqb.dqb_itime = 0;
clear_bit(DQ_INODES_B, &dquot->dq_flags);
}
/* All information is properly updated, clear the flags */
__clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
__clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
__clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
__clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
__clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
spin_unlock(&dq_data_lock);
err = ocfs2_qinfo_lock(info, freeing);
if (err < 0) {
mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
" (type=%d, id=%u)\n", dquot->dq_id.type,
(unsigned)from_kqid(&init_user_ns, dquot->dq_id));
goto out;
}
if (freeing)
OCFS2_DQUOT(dquot)->dq_use_count--;
err = qtree_write_dquot(&info->dqi_gi, dquot);
if (err < 0)
goto out_qlock;
if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
err = qtree_release_dquot(&info->dqi_gi, dquot);
if (info_dirty(sb_dqinfo(sb, type))) {
err2 = __ocfs2_global_write_info(sb, type);
if (!err)
err = err2;
}
}
out_qlock:
ocfs2_qinfo_unlock(info, freeing);
out:
if (err < 0)
mlog_errno(err);
return err;
}
/*
* Functions for periodic syncing of dquots with global file
*/
static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
{
handle_t *handle;
struct super_block *sb = dquot->dq_sb;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type,
type, sb->s_id);
if (type != dquot->dq_id.type)
goto out;
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
if (status < 0)
mlog_errno(status);
/* We have to write local structure as well... */
status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
return status;
}
static void qsync_work_fn(struct work_struct *work)
{
struct ocfs2_mem_dqinfo *oinfo = container_of(work,
struct ocfs2_mem_dqinfo,
dqi_sync_work.work);
struct super_block *sb = oinfo->dqi_gqinode->i_sb;
dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
}
/*
* Wrappers for generic quota functions
*/
static int ocfs2_write_dquot(struct dquot *dquot)
{
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type);
handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
status = ocfs2_local_write_dquot(dquot);
mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out:
return status;
}
static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
{
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
/*
* We modify tree, leaf block, global info, local chunk header,
* global and local inode; OCFS2_QINFO_WRITE_CREDITS already
* accounts for inode update
*/
return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
OCFS2_QINFO_WRITE_CREDITS +
OCFS2_INODE_UPDATE_CREDITS;
}
void ocfs2_drop_dquot_refs(struct work_struct *work)
{
struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
dquot_drop_work);
struct llist_node *list;
struct ocfs2_dquot *odquot, *next_odquot;
list = llist_del_all(&osb->dquot_drop_list);
llist_for_each_entry_safe(odquot, next_odquot, list, list) {
/* Drop the reference we acquired in ocfs2_dquot_release() */
dqput(&odquot->dq_dquot);
}
}
/*
* Called when the last reference to dquot is dropped. If we are called from
* downconvert thread, we cannot do all the handling here because grabbing
* quota lock could deadlock (the node holding the quota lock could need some
* other cluster lock to proceed but with blocked downconvert thread we cannot
* release any lock).
*/
static int ocfs2_release_dquot(struct dquot *dquot)
{
handle_t *handle;
struct ocfs2_mem_dqinfo *oinfo =
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type);
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
if (atomic_read(&dquot->dq_count) > 1)
goto out;
/* Running from downconvert thread? Postpone quota processing to wq */
if (current == osb->dc_task) {
/*
* Grab our own reference to dquot and queue it for delayed
* dropping. Quota code rechecks after calling
* ->release_dquot() and won't free dquot structure.
*/
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
queue_work(ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb,
ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
status = ocfs2_global_release_dquot(dquot);
if (status < 0) {
mlog_errno(status);
goto out_trans;
}
status = ocfs2_local_release_dquot(handle, dquot);
/*
* If we fail here, we cannot do much as global structure is
* already released. So just complain...
*/
if (status < 0)
mlog_errno(status);
/*
* Clear dq_off so that we search for the structure in quota file next
* time we acquire it. The structure might be deleted and reallocated
* elsewhere by another node while our dquot structure is on freelist.
*/
dquot->dq_off = 0;
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_trans:
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
mutex_unlock(&dquot->dq_lock);
if (status)
mlog_errno(status);
return status;
}
/*
* Read global dquot structure from disk or create it if it does
* not exist. Also update use count of the global structure and
* create structure in node-local quota file.
*/
static int ocfs2_acquire_dquot(struct dquot *dquot)
{
int status = 0, err;
int ex = 0;
struct super_block *sb = dquot->dq_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct inode *gqinode = info->dqi_gqinode;
int need_alloc = ocfs2_global_qinit_alloc(sb, type);
handle_t *handle;
trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
type);
mutex_lock(&dquot->dq_lock);
/*
* We need an exclusive lock, because we're going to update use count
* and instantiate possibly new dquot structure
*/
status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
status = ocfs2_qinfo_lock(info, 0);
if (status < 0)
goto out_dq;
/*
* We always want to read dquot structure from disk because we don't
* know what happened with it while it was on freelist.
*/
status = qtree_read_dquot(&info->dqi_gi, dquot);
ocfs2_qinfo_unlock(info, 0);
if (status < 0)
goto out_dq;
OCFS2_DQUOT(dquot)->dq_use_count++;
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
if (!dquot->dq_off) { /* No real quota entry? */
ex = 1;
/*
* Add blocks to quota file before we start a transaction since
* locking allocators ranks above a transaction start
*/
WARN_ON(journal_current_handle());
status = ocfs2_extend_no_holes(gqinode, NULL,
i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
i_size_read(gqinode));
if (status < 0)
goto out_dq;
}
handle = ocfs2_start_trans(osb,
ocfs2_calc_global_qinit_credits(sb, type));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
goto out_dq;
}
status = ocfs2_qinfo_lock(info, ex);
if (status < 0)
goto out_trans;
status = qtree_write_dquot(&info->dqi_gi, dquot);
if (ex && info_dirty(sb_dqinfo(sb, type))) {
err = __ocfs2_global_write_info(sb, type);
if (!status)
status = err;
}
ocfs2_qinfo_unlock(info, ex);
out_trans:
ocfs2_commit_trans(osb, handle);
out_dq:
ocfs2_unlock_global_qf(info, 1);
if (status < 0)
goto out;
status = ocfs2_create_local_dquot(dquot);
if (status < 0)
goto out;
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out:
mutex_unlock(&dquot->dq_lock);
if (status)
mlog_errno(status);
return status;
}
static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
{
unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
(1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
(1 << (DQ_LASTSET_B + QIF_INODES_B)) |
(1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
(1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
(1 << (DQ_LASTSET_B + QIF_ITIME_B));
int sync = 0;
int status;
struct super_block *sb = dquot->dq_sb;
int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
type);
/* In case user set some limits, sync dquot immediately to global
* quota file so that information propagates quicker */
spin_lock(&dq_data_lock);
if (dquot->dq_flags & mask)
sync = 1;
spin_unlock(&dq_data_lock);
/* This is a slight hack but we can't afford getting global quota
* lock if we already have a transaction started. */
if (!sync || journal_current_handle()) {
status = ocfs2_write_dquot(dquot);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
goto out_dlock;
}
/* Now write updated local dquot structure */
status = ocfs2_local_write_dquot(dquot);
out_dlock:
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
if (status)
mlog_errno(status);
return status;
}
/* This should happen only after set_dqinfo(). */
static int ocfs2_write_info(struct super_block *sb, int type)
{
handle_t *handle;
int status = 0;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
status = dquot_commit_info(sb, type);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
if (status)
mlog_errno(status);
return status;
}
static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
{
struct ocfs2_dquot *dquot =
kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
if (!dquot)
return NULL;
return &dquot->dq_dquot;
}
static void ocfs2_destroy_dquot(struct dquot *dquot)
{
kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
const struct dquot_operations ocfs2_quota_operations = {
/* We never make dquot dirty so .write_dquot is never called */
.acquire_dquot = ocfs2_acquire_dquot,
.release_dquot = ocfs2_release_dquot,
.mark_dirty = ocfs2_mark_dquot_dirty,
.write_info = ocfs2_write_info,
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
};

1321
fs/ocfs2/quota_local.c Normal file

File diff suppressed because it is too large Load diff

4476
fs/ocfs2/refcounttree.c Normal file

File diff suppressed because it is too large Load diff

118
fs/ocfs2/refcounttree.h Normal file
View file

@ -0,0 +1,118 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* refcounttree.h
*
* Copyright (C) 2009 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_REFCOUNTTREE_H
#define OCFS2_REFCOUNTTREE_H
struct ocfs2_refcount_tree {
struct rb_node rf_node;
u64 rf_blkno;
u32 rf_generation;
struct kref rf_getcnt;
struct rw_semaphore rf_sem;
struct ocfs2_lock_res rf_lockres;
int rf_removed;
/* the following 4 fields are used by caching_info. */
spinlock_t rf_lock;
struct ocfs2_caching_info rf_ci;
struct mutex rf_io_mutex;
struct super_block *rf_sb;
};
void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
struct ocfs2_refcount_tree **tree,
struct buffer_head **ref_bh);
void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *tree,
int rw);
int ocfs2_decrease_refcount(struct inode *inode,
handle_t *handle, u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int delete);
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
int *ref_blocks);
int ocfs2_refcount_cow(struct inode *inode,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
handle_t *handle,
void *para);
/*
* Some refcount caller need to do more work after we modify the data b-tree
* during refcount operation(including CoW and add refcount flag), and make the
* transaction complete. So it must give us this structure so that we can do it
* within our transaction.
*
*/
struct ocfs2_post_refcount {
int credits; /* credits it need for journal. */
ocfs2_post_refcount_func *func; /* real function. */
void *para;
};
int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
struct ocfs2_xattr_value_root *xv,
int *meta_add, int *credits);
int ocfs2_refcount_cow_xattr(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_refcount_tree *ref_tree,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters);
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
u32 cpos, u32 p_cluster, u32 num_clusters,
struct ocfs2_cached_dealloc_ctxt *dealloc,
struct ocfs2_post_refcount *post);
int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
int ocfs2_try_remove_refcount_tree(struct inode *inode,
struct buffer_head *di_bh);
int ocfs2_increase_refcount(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
#endif /* OCFS2_REFCOUNTTREE_H */

839
fs/ocfs2/reservations.c Normal file
View file

@ -0,0 +1,839 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* reservations.c
*
* Allocation reservations implementation
*
* Some code borrowed from fs/ext3/balloc.c and is:
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* The rest is copyright (C) 2010 Novell. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/list.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "ocfs2_trace.h"
#ifdef CONFIG_OCFS2_DEBUG_FS
#define OCFS2_CHECK_RESERVATIONS
#endif
DEFINE_SPINLOCK(resv_lock);
#define OCFS2_MIN_RESV_WINDOW_BITS 8
#define OCFS2_MAX_RESV_WINDOW_BITS 1024
int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
{
return (osb->osb_resv_level && osb->osb_dir_resv_level);
}
static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv)
{
struct ocfs2_super *osb = resmap->m_osb;
unsigned int bits;
if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
/* 8, 16, 32, 64, 128, 256, 512, 1024 */
bits = 4 << osb->osb_resv_level;
} else {
bits = 4 << osb->osb_dir_resv_level;
}
return bits;
}
static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
{
if (resv->r_len)
return resv->r_start + resv->r_len - 1;
return resv->r_start;
}
static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
{
return !!(resv->r_len == 0);
}
static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
{
if (resmap->m_osb->osb_resv_level == 0)
return 1;
return 0;
}
static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
{
struct ocfs2_super *osb = resmap->m_osb;
struct rb_node *node;
struct ocfs2_alloc_reservation *resv;
int i = 0;
mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
osb->dev_str, resmap->m_bitmap_len);
node = rb_first(&resmap->m_reservations);
while (node) {
resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
"\tlast_len: %u\n", resv->r_start,
ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
resv->r_last_len);
node = rb_next(node);
i++;
}
mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
i = 0;
list_for_each_entry(resv, &resmap->m_lru, r_lru) {
mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
"last_start: %u\tlast_len: %u\n", i, resv->r_start,
ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
resv->r_last_len);
i++;
}
}
#ifdef OCFS2_CHECK_RESERVATIONS
static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
int i,
struct ocfs2_alloc_reservation *resv)
{
char *disk_bitmap = resmap->m_disk_bitmap;
unsigned int start = resv->r_start;
unsigned int end = ocfs2_resv_end(resv);
while (start <= end) {
if (ocfs2_test_bit(start, disk_bitmap)) {
mlog(ML_ERROR,
"reservation %d covers an allocated area "
"starting at bit %u!\n", i, start);
return 1;
}
start++;
}
return 0;
}
static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
{
unsigned int off = 0;
int i = 0;
struct rb_node *node;
struct ocfs2_alloc_reservation *resv;
node = rb_first(&resmap->m_reservations);
while (node) {
resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
if (i > 0 && resv->r_start <= off) {
mlog(ML_ERROR, "reservation %d has bad start off!\n",
i);
goto bad;
}
if (resv->r_len == 0) {
mlog(ML_ERROR, "reservation %d has no length!\n",
i);
goto bad;
}
if (resv->r_start > ocfs2_resv_end(resv)) {
mlog(ML_ERROR, "reservation %d has invalid range!\n",
i);
goto bad;
}
if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
i);
goto bad;
}
if (ocfs2_validate_resmap_bits(resmap, i, resv))
goto bad;
off = ocfs2_resv_end(resv);
node = rb_next(node);
i++;
}
return;
bad:
ocfs2_dump_resv(resmap);
BUG();
}
#else
static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
{
}
#endif
void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
{
memset(resv, 0, sizeof(*resv));
INIT_LIST_HEAD(&resv->r_lru);
}
void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
unsigned int flags)
{
BUG_ON(flags & ~OCFS2_RESV_TYPES);
resv->r_flags |= flags;
}
int ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap)
{
memset(resmap, 0, sizeof(*resmap));
resmap->m_osb = osb;
resmap->m_reservations = RB_ROOT;
/* m_bitmap_len is initialized to zero by the above memset. */
INIT_LIST_HEAD(&resmap->m_lru);
return 0;
}
static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv)
{
assert_spin_locked(&resv_lock);
if (!list_empty(&resv->r_lru))
list_del_init(&resv->r_lru);
list_add_tail(&resv->r_lru, &resmap->m_lru);
}
static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
{
resv->r_len = 0;
resv->r_start = 0;
}
static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv)
{
if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
list_del_init(&resv->r_lru);
rb_erase(&resv->r_node, &resmap->m_reservations);
resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
}
}
static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv)
{
assert_spin_locked(&resv_lock);
__ocfs2_resv_trunc(resv);
/*
* last_len and last_start no longer make sense if
* we're changing the range of our allocations.
*/
resv->r_last_len = resv->r_last_start = 0;
ocfs2_resv_remove(resmap, resv);
}
/* does nothing if 'resv' is null */
void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv)
{
if (resv) {
spin_lock(&resv_lock);
__ocfs2_resv_discard(resmap, resv);
spin_unlock(&resv_lock);
}
}
static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
{
struct rb_node *node;
struct ocfs2_alloc_reservation *resv;
assert_spin_locked(&resv_lock);
while ((node = rb_last(&resmap->m_reservations)) != NULL) {
resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
__ocfs2_resv_discard(resmap, resv);
}
}
void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
unsigned int clen, char *disk_bitmap)
{
if (ocfs2_resmap_disabled(resmap))
return;
spin_lock(&resv_lock);
ocfs2_resmap_clear_all_resv(resmap);
resmap->m_bitmap_len = clen;
resmap->m_disk_bitmap = disk_bitmap;
spin_unlock(&resv_lock);
}
void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
{
/* Does nothing for now. Keep this around for API symmetry */
}
static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *new)
{
struct rb_root *root = &resmap->m_reservations;
struct rb_node *parent = NULL;
struct rb_node **p = &root->rb_node;
struct ocfs2_alloc_reservation *tmp;
assert_spin_locked(&resv_lock);
trace_ocfs2_resv_insert(new->r_start, new->r_len);
while (*p) {
parent = *p;
tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
if (new->r_start < tmp->r_start) {
p = &(*p)->rb_left;
/*
* This is a good place to check for
* overlapping reservations.
*/
BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
} else if (new->r_start > ocfs2_resv_end(tmp)) {
p = &(*p)->rb_right;
} else {
/* This should never happen! */
mlog(ML_ERROR, "Duplicate reservation window!\n");
BUG();
}
}
rb_link_node(&new->r_node, parent, p);
rb_insert_color(&new->r_node, root);
new->r_flags |= OCFS2_RESV_FLAG_INUSE;
ocfs2_resv_mark_lru(resmap, new);
ocfs2_check_resmap(resmap);
}
/**
* ocfs2_find_resv_lhs() - find the window which contains goal
* @resmap: reservation map to search
* @goal: which bit to search for
*
* If a window containing that goal is not found, we return the window
* which comes before goal. Returns NULL on empty rbtree or no window
* before goal.
*/
static struct ocfs2_alloc_reservation *
ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
{
struct ocfs2_alloc_reservation *resv = NULL;
struct ocfs2_alloc_reservation *prev_resv = NULL;
struct rb_node *node = resmap->m_reservations.rb_node;
assert_spin_locked(&resv_lock);
if (!node)
return NULL;
node = rb_first(&resmap->m_reservations);
while (node) {
resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
break;
/* Check if we overshot the reservation just before goal? */
if (resv->r_start > goal) {
resv = prev_resv;
break;
}
prev_resv = resv;
node = rb_next(node);
}
return resv;
}
/*
* We are given a range within the bitmap, which corresponds to a gap
* inside the reservations tree (search_start, search_len). The range
* can be anything from the whole bitmap, to a gap between
* reservations.
*
* The start value of *rstart is insignificant.
*
* This function searches the bitmap range starting at search_start
* with length search_len for a set of contiguous free bits. We try
* to find up to 'wanted' bits, but can sometimes return less.
*
* Returns the length of allocation, 0 if no free bits are found.
*
* *cstart and *clen will also be populated with the result.
*/
static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
unsigned int wanted,
unsigned int search_start,
unsigned int search_len,
unsigned int *rstart,
unsigned int *rlen)
{
void *bitmap = resmap->m_disk_bitmap;
unsigned int best_start, best_len = 0;
int offset, start, found;
trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
wanted, resmap->m_bitmap_len);
found = best_start = best_len = 0;
start = search_start;
while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
start)) != -1) {
/* Search reached end of the region */
if (offset >= (search_start + search_len))
break;
if (offset == start) {
/* we found a zero */
found++;
/* move start to the next bit to test */
start++;
} else {
/* got a zero after some ones */
found = 1;
start = offset + 1;
}
if (found > best_len) {
best_len = found;
best_start = start - found;
}
if (found >= wanted)
break;
}
if (best_len == 0)
return 0;
if (best_len >= wanted)
best_len = wanted;
*rlen = best_len;
*rstart = best_start;
trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
return *rlen;
}
static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
unsigned int goal, unsigned int wanted)
{
struct rb_root *root = &resmap->m_reservations;
unsigned int gap_start, gap_end, gap_len;
struct ocfs2_alloc_reservation *prev_resv, *next_resv;
struct rb_node *prev, *next;
unsigned int cstart, clen;
unsigned int best_start = 0, best_len = 0;
/*
* Nasty cases to consider:
*
* - rbtree is empty
* - our window should be first in all reservations
* - our window should be last in all reservations
* - need to make sure we don't go past end of bitmap
*/
trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
goal, wanted, RB_EMPTY_ROOT(root));
assert_spin_locked(&resv_lock);
if (RB_EMPTY_ROOT(root)) {
/*
* Easiest case - empty tree. We can just take
* whatever window of free bits we want.
*/
clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
resmap->m_bitmap_len - goal,
&cstart, &clen);
/*
* This should never happen - the local alloc window
* will always have free bits when we're called.
*/
BUG_ON(goal == 0 && clen == 0);
if (clen == 0)
return;
resv->r_start = cstart;
resv->r_len = clen;
ocfs2_resv_insert(resmap, resv);
return;
}
prev_resv = ocfs2_find_resv_lhs(resmap, goal);
if (prev_resv == NULL) {
/*
* A NULL here means that the search code couldn't
* find a window that starts before goal.
*
* However, we can take the first window after goal,
* which is also by definition, the leftmost window in
* the entire tree. If we can find free bits in the
* gap between goal and the LHS window, then the
* reservation can safely be placed there.
*
* Otherwise we fall back to a linear search, checking
* the gaps in between windows for a place to
* allocate.
*/
next = rb_first(root);
next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
r_node);
/*
* The search should never return such a window. (see
* comment above
*/
if (next_resv->r_start <= goal) {
mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
goal, next_resv->r_start, next_resv->r_len);
ocfs2_dump_resv(resmap);
BUG();
}
clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
next_resv->r_start - goal,
&cstart, &clen);
if (clen) {
best_len = clen;
best_start = cstart;
if (best_len == wanted)
goto out_insert;
}
prev_resv = next_resv;
next_resv = NULL;
}
trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
ocfs2_resv_end(prev_resv));
prev = &prev_resv->r_node;
/* Now we do a linear search for a window, starting at 'prev_rsv' */
while (1) {
next = rb_next(prev);
if (next) {
next_resv = rb_entry(next,
struct ocfs2_alloc_reservation,
r_node);
gap_start = ocfs2_resv_end(prev_resv) + 1;
gap_end = next_resv->r_start - 1;
gap_len = gap_end - gap_start + 1;
} else {
/*
* We're at the rightmost edge of the
* tree. See if a reservation between this
* window and the end of the bitmap will work.
*/
gap_start = ocfs2_resv_end(prev_resv) + 1;
gap_len = resmap->m_bitmap_len - gap_start;
gap_end = resmap->m_bitmap_len - 1;
}
trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
next ? ocfs2_resv_end(next_resv) : -1);
/*
* No need to check this gap if we have already found
* a larger region of free bits.
*/
if (gap_len <= best_len)
goto next_resv;
clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
gap_len, &cstart, &clen);
if (clen == wanted) {
best_len = clen;
best_start = cstart;
goto out_insert;
} else if (clen > best_len) {
best_len = clen;
best_start = cstart;
}
next_resv:
if (!next)
break;
prev = next;
prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
r_node);
}
out_insert:
if (best_len) {
resv->r_start = best_start;
resv->r_len = best_len;
ocfs2_resv_insert(resmap, resv);
}
}
static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
unsigned int wanted)
{
struct ocfs2_alloc_reservation *lru_resv;
int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
unsigned int min_bits;
if (!tmpwindow)
min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
else
min_bits = wanted; /* We at know the temp window will use all
* of these bits */
/*
* Take the first reservation off the LRU as our 'target'. We
* don't try to be smart about it. There might be a case for
* searching based on size but I don't have enough data to be
* sure. --Mark (3/16/2010)
*/
lru_resv = list_first_entry(&resmap->m_lru,
struct ocfs2_alloc_reservation, r_lru);
trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
lru_resv->r_len,
ocfs2_resv_end(lru_resv));
/*
* Cannibalize (some or all) of the target reservation and
* feed it to the current window.
*/
if (lru_resv->r_len <= min_bits) {
/*
* Discard completely if size is less than or equal to a
* reasonable threshold - 50% of window bits for non temporary
* windows.
*/
resv->r_start = lru_resv->r_start;
resv->r_len = lru_resv->r_len;
__ocfs2_resv_discard(resmap, lru_resv);
} else {
unsigned int shrink;
if (tmpwindow)
shrink = min_bits;
else
shrink = lru_resv->r_len / 2;
lru_resv->r_len -= shrink;
resv->r_start = ocfs2_resv_end(lru_resv) + 1;
resv->r_len = shrink;
}
trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
resv->r_len, resv->r_last_start,
resv->r_last_len);
ocfs2_resv_insert(resmap, resv);
}
static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
unsigned int wanted)
{
unsigned int goal = 0;
BUG_ON(!ocfs2_resv_empty(resv));
/*
* Begin by trying to get a window as close to the previous
* one as possible. Using the most recent allocation as a
* start goal makes sense.
*/
if (resv->r_last_len) {
goal = resv->r_last_start + resv->r_last_len;
if (goal >= resmap->m_bitmap_len)
goal = 0;
}
__ocfs2_resv_find_window(resmap, resv, goal, wanted);
/* Search from last alloc didn't work, try once more from beginning. */
if (ocfs2_resv_empty(resv) && goal != 0)
__ocfs2_resv_find_window(resmap, resv, 0, wanted);
if (ocfs2_resv_empty(resv)) {
/*
* Still empty? Pull oldest one off the LRU, remove it from
* tree, put this one in it's place.
*/
ocfs2_cannibalize_resv(resmap, resv, wanted);
}
BUG_ON(ocfs2_resv_empty(resv));
}
int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
int *cstart, int *clen)
{
if (resv == NULL || ocfs2_resmap_disabled(resmap))
return -ENOSPC;
spin_lock(&resv_lock);
if (ocfs2_resv_empty(resv)) {
/*
* We don't want to over-allocate for temporary
* windows. Otherwise, we run the risk of fragmenting the
* allocation space.
*/
unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
wanted = *clen;
/*
* Try to get a window here. If it works, we must fall
* through and test the bitmap . This avoids some
* ping-ponging of windows due to non-reserved space
* being allocation before we initialize a window for
* that inode.
*/
ocfs2_resv_find_window(resmap, resv, wanted);
trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
}
BUG_ON(ocfs2_resv_empty(resv));
*cstart = resv->r_start;
*clen = resv->r_len;
spin_unlock(&resv_lock);
return 0;
}
static void
ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
unsigned int start, unsigned int end)
{
unsigned int rhs = 0;
unsigned int old_end = ocfs2_resv_end(resv);
BUG_ON(start != resv->r_start || old_end < end);
/*
* Completely used? We can remove it then.
*/
if (old_end == end) {
__ocfs2_resv_discard(resmap, resv);
return;
}
rhs = old_end - end;
/*
* This should have been trapped above.
*/
BUG_ON(rhs == 0);
resv->r_start = end + 1;
resv->r_len = old_end - resv->r_start + 1;
}
void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
u32 cstart, u32 clen)
{
unsigned int cend = cstart + clen - 1;
if (resmap == NULL || ocfs2_resmap_disabled(resmap))
return;
if (resv == NULL)
return;
BUG_ON(cstart != resv->r_start);
spin_lock(&resv_lock);
trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
ocfs2_resv_end(resv), resv->r_len,
resv->r_last_start,
resv->r_last_len);
BUG_ON(cstart < resv->r_start);
BUG_ON(cstart > ocfs2_resv_end(resv));
BUG_ON(cend > ocfs2_resv_end(resv));
ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
resv->r_last_start = cstart;
resv->r_last_len = clen;
/*
* May have been discarded above from
* ocfs2_adjust_resv_from_alloc().
*/
if (!ocfs2_resv_empty(resv))
ocfs2_resv_mark_lru(resmap, resv);
trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
resv->r_len, resv->r_last_start,
resv->r_last_len);
ocfs2_check_resmap(resmap);
spin_unlock(&resv_lock);
}

159
fs/ocfs2/reservations.h Normal file
View file

@ -0,0 +1,159 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* reservations.h
*
* Allocation reservations function prototypes and structures.
*
* Copyright (C) 2010 Novell. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_RESERVATIONS_H
#define OCFS2_RESERVATIONS_H
#include <linux/rbtree.h>
#define OCFS2_DEFAULT_RESV_LEVEL 2
#define OCFS2_MAX_RESV_LEVEL 9
#define OCFS2_MIN_RESV_LEVEL 0
struct ocfs2_alloc_reservation {
struct rb_node r_node;
unsigned int r_start; /* Beginning of current window */
unsigned int r_len; /* Length of the window */
unsigned int r_last_len; /* Length of most recent alloc */
unsigned int r_last_start; /* Start of most recent alloc */
struct list_head r_lru; /* LRU list head */
unsigned int r_flags;
};
#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
* destroyed immedately after use */
#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
* directory btree */
struct ocfs2_reservation_map {
struct rb_root m_reservations;
char *m_disk_bitmap;
struct ocfs2_super *m_osb;
/* The following are not initialized to meaningful values until a disk
* bitmap is provided. */
u32 m_bitmap_len; /* Number of valid
* bits available */
struct list_head m_lru; /* LRU of reservations
* structures. */
};
void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
unsigned int flags);
int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
/**
* ocfs2_resv_discard() - truncate a reservation
* @resmap:
* @resv: the reservation to truncate.
*
* After this function is called, the reservation will be empty, and
* unlinked from the rbtree.
*/
void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv);
/**
* ocfs2_resmap_init() - Initialize fields of a reservations bitmap
* @resmap: struct ocfs2_reservation_map to initialize
* @obj: unused for now
* @ops: unused for now
* @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
*
* Only possible return value other than '0' is -ENOMEM for failure to
* allocation mirror bitmap.
*/
int ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap);
/**
* ocfs2_resmap_restart() - "restart" a reservation bitmap
* @resmap: reservations bitmap
* @clen: Number of valid bits in the bitmap
* @disk_bitmap: the disk bitmap this resmap should refer to.
*
* Re-initialize the parameters of a reservation bitmap. This is
* useful for local alloc window slides.
*
* This function will call ocfs2_trunc_resv against all existing
* reservations. A future version will recalculate existing
* reservations based on the new bitmap.
*/
void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
unsigned int clen, char *disk_bitmap);
/**
* ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
* @resmap: the struct ocfs2_reservation_map to uninitialize
*/
void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
/**
* ocfs2_resmap_resv_bits() - Return still-valid reservation bits
* @resmap: reservations bitmap
* @resv: reservation to base search from
* @cstart: start of proposed allocation
* @clen: length (in clusters) of proposed allocation
*
* Using the reservation data from resv, this function will compare
* resmap and resmap->m_disk_bitmap to determine what part (if any) of
* the reservation window is still clear to use. If resv is empty,
* this function will try to allocate a window for it.
*
* On success, zero is returned and the valid allocation area is set in cstart
* and clen.
*
* Returns -ENOSPC if reservations are disabled.
*/
int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
int *cstart, int *clen);
/**
* ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
* @resmap: reservations bitmap
* @resv: optional reservation to recalulate based on new bitmap
* @cstart: start of allocation in clusters
* @clen: end of allocation in clusters.
*
* Tell the reservation code that bits were used to fulfill allocation in
* resmap. The bits don't have to have been part of any existing
* reservation. But we must always call this function when bits are claimed.
* Internally, the reservations code will use this information to mark the
* reservations bitmap. If resv is passed, it's next allocation window will be
* calculated. It also expects that 'cstart' is the same as we passed back
* from ocfs2_resmap_resv_bits().
*/
void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
u32 cstart, u32 clen);
#endif /* OCFS2_RESERVATIONS_H */

589
fs/ocfs2/resize.c Normal file
View file

@ -0,0 +1,589 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* resize.c
*
* volume resize.
* Inspired by ext3/resize.c.
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "resize.h"
/*
* Check whether there are new backup superblocks exist
* in the last group. If there are some, mark them or clear
* them in the bitmap.
*
* Return how many backups we find in the last group.
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
u16 cl_cpg,
int set)
{
int i;
u16 backups = 0;
u32 cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
gd_blkno = ocfs2_which_cluster_group(inode, cluster);
if (gd_blkno < lgd_blkno)
continue;
else if (gd_blkno > lgd_blkno)
break;
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
else
ocfs2_clear_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
backups++;
}
return backups;
}
static int ocfs2_update_last_group_and_inode(handle_t *handle,
struct inode *bm_inode,
struct buffer_head *bm_bh,
struct buffer_head *group_bh,
u32 first_new_cluster,
int new_clusters)
{
int ret = 0;
struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct ocfs2_chain_rec *cr;
struct ocfs2_group_desc *group;
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
le16_add_cpu(&group->bg_free_bits_count, num_bits);
/*
* check whether there are some new backup superblocks exist in
* this group and update the group bitmap accordingly.
*/
if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
cl_cpg, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_rollback;
}
chain = le16_to_cpu(group->bg_chain);
cr = (&cl->cl_recs[chain]);
le32_add_cpu(&cr->c_total, num_bits);
le32_add_cpu(&cr->c_free, num_bits);
le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
le32_add_cpu(&fe->i_clusters, new_clusters);
if (backups) {
le32_add_cpu(&cr->c_free, -1 * backups);
le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
}
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
ocfs2_journal_dirty(handle, bm_bh);
out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
cl_cpg, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
}
out:
if (ret)
mlog_errno(ret);
return ret;
}
static int update_backups(struct inode * inode, u32 clusters, char *data)
{
int i, ret = 0;
u32 cluster;
u64 blkno;
struct buffer_head *backup = NULL;
struct ocfs2_dinode *backup_di = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
/* calculate the real backups we need to update. */
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
if (cluster > clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
if (ret < 0) {
mlog_errno(ret);
break;
}
memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
backup_di = (struct ocfs2_dinode *)backup->b_data;
backup_di->i_blkno = cpu_to_le64(blkno);
ret = ocfs2_write_super_or_backup(osb, backup);
brelse(backup);
backup = NULL;
if (ret < 0) {
mlog_errno(ret);
break;
}
}
return ret;
}
static void ocfs2_update_super_and_backups(struct inode *inode,
int new_clusters)
{
int ret;
u32 clusters = 0;
struct buffer_head *super_bh = NULL;
struct ocfs2_dinode *super_di = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
/*
* update the superblock last.
* It doesn't matter if the write failed.
*/
ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
&super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
super_di = (struct ocfs2_dinode *)super_bh->b_data;
le32_add_cpu(&super_di->i_clusters, new_clusters);
clusters = le32_to_cpu(super_di->i_clusters);
ret = ocfs2_write_super_or_backup(osb, super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
ret = update_backups(inode, clusters, super_bh->b_data);
out:
brelse(super_bh);
if (ret)
printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
" during fs resize. This condition is not fatal,"
" but fsck.ocfs2 should be run to fix it\n",
osb->dev_str);
return;
}
/*
* Extend the filesystem to the new number of clusters specified. This entry
* point is only used to extend the current filesystem to the end of the last
* existing group.
*/
int ocfs2_group_extend(struct inode * inode, int new_clusters)
{
int ret;
handle_t *handle;
struct buffer_head *main_bm_bh = NULL;
struct buffer_head *group_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_group_desc *group = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 cl_bpc;
u32 first_new_cluster;
u64 lgd_blkno;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
if (new_clusters < 0)
return -EINVAL;
else if (new_clusters == 0)
return 0;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
* so any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb, 0,
osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
"Force to do offline resize.");
ret = -EINVAL;
goto out_unlock;
}
first_new_cluster = le32_to_cpu(fe->i_clusters);
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
&group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
ret = -EINVAL;
goto out_unlock;
}
trace_ocfs2_group_extend(
(unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
goto out_unlock;
}
/* update the last group descriptor and inode. */
ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
main_bm_bh, group_bh,
first_new_cluster,
new_clusters);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
brelse(group_bh);
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
return ret;
}
static int ocfs2_check_new_group(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_new_group_input *input,
struct buffer_head *group_bh)
{
int ret;
struct ocfs2_group_desc *gd =
(struct ocfs2_group_desc *)group_bh->b_data;
u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
if (ret)
goto out;
ret = -EINVAL;
if (le16_to_cpu(gd->bg_chain) != input->chain)
mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
"while input has %u set.\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_chain), input->chain);
else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
"input has %u clusters set\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits), input->clusters);
else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
"but it should have %u set\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits),
input->frees * cl_bpc);
else
ret = 0;
out:
return ret;
}
static int ocfs2_verify_group_and_input(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_new_group_input *input,
struct buffer_head *group_bh)
{
u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
u32 total_clusters = le32_to_cpu(di->i_clusters);
int ret = -EINVAL;
if (cluster < total_clusters)
mlog(ML_ERROR, "add a group which is in the current volume.\n");
else if (input->chain >= cl_count)
mlog(ML_ERROR, "input chain exceeds the limit.\n");
else if (next_free != cl_count && next_free != input->chain)
mlog(ML_ERROR,
"the add group should be in chain %u\n", next_free);
else if (total_clusters + input->clusters < total_clusters)
mlog(ML_ERROR, "add group's clusters overflow.\n");
else if (input->clusters > cl_cpg)
mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
else if (input->frees > input->clusters)
mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
else if (total_clusters % cl_cpg != 0)
mlog(ML_ERROR,
"the last group isn't full. Use group extend first.\n");
else if (input->group != ocfs2_which_cluster_group(inode, cluster))
mlog(ML_ERROR, "group blkno is invalid\n");
else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
mlog(ML_ERROR, "group descriptor check failed.\n");
else
ret = 0;
return ret;
}
/* Add a new group descriptor to global_bitmap. */
int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
{
int ret;
handle_t *handle;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group = NULL;
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
u64 bg_ptr;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb, 0,
osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small."
" Force to do offline resize.");
ret = -EINVAL;
goto out_unlock;
}
ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
if (ret < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # %llu "
"from the device.", (unsigned long long)input->group);
goto out_unlock;
}
ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
goto out_free_group_bh;
}
trace_ocfs2_group_add((unsigned long long)input->group,
input->chain, input->clusters, input->frees);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
goto out_free_group_bh;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
bg_ptr = le64_to_cpu(group->bg_next_group);
group->bg_next_group = cr->c_blkno;
ocfs2_journal_dirty(handle, group_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
group->bg_next_group = cpu_to_le64(bg_ptr);
mlog_errno(ret);
goto out_commit;
}
if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
le16_add_cpu(&cl->cl_next_free_rec, 1);
memset(cr, 0, sizeof(struct ocfs2_chain_rec));
}
cr->c_blkno = cpu_to_le64(input->group);
le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
le32_add_cpu(&fe->id1.bitmap1.i_used,
(input->clusters - input->frees) * cl_bpc);
le32_add_cpu(&fe->i_clusters, input->clusters);
ocfs2_journal_dirty(handle, main_bm_bh);
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
out_commit:
ocfs2_commit_trans(osb, handle);
out_free_group_bh:
brelse(group_bh);
out_unlock:
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
return ret;
}

32
fs/ocfs2/resize.h Normal file
View file

@ -0,0 +1,32 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* resize.h
*
* Function prototypes
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_RESIZE_H
#define OCFS2_RESIZE_H
int ocfs2_group_extend(struct inode * inode, int new_clusters);
int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
#endif /* OCFS2_RESIZE_H */

538
fs/ocfs2/slot_map.c Normal file
View file

@ -0,0 +1,538 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* slot_map.c
*
*
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
struct ocfs2_slot {
int sl_valid;
unsigned int sl_node_num;
};
struct ocfs2_slot_info {
int si_extended;
int si_slots_per_block;
struct inode *si_inode;
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
struct ocfs2_slot *si_slots;
};
static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
unsigned int node_num);
static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
int slot_num)
{
BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
si->si_slots[slot_num].sl_valid = 0;
}
static void ocfs2_set_slot(struct ocfs2_slot_info *si,
int slot_num, unsigned int node_num)
{
BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
si->si_slots[slot_num].sl_valid = 1;
si->si_slots[slot_num].sl_node_num = node_num;
}
/* This version is for the extended slot map */
static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
{
int b, i, slotno;
struct ocfs2_slot_map_extended *se;
slotno = 0;
for (b = 0; b < si->si_blocks; b++) {
se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
for (i = 0;
(i < si->si_slots_per_block) &&
(slotno < si->si_num_slots);
i++, slotno++) {
if (se->se_slots[i].es_valid)
ocfs2_set_slot(si, slotno,
le32_to_cpu(se->se_slots[i].es_node_num));
else
ocfs2_invalidate_slot(si, slotno);
}
}
}
/*
* Post the slot information on disk into our slot_info struct.
* Must be protected by osb_lock.
*/
static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
{
int i;
struct ocfs2_slot_map *sm;
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
ocfs2_invalidate_slot(si, i);
else
ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
}
}
static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
/*
* The slot data will have been refreshed when ocfs2_super_lock
* was taken.
*/
if (si->si_extended)
ocfs2_update_slot_info_extended(si);
else
ocfs2_update_slot_info_old(si);
}
int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
int ret;
struct ocfs2_slot_info *si = osb->slot_info;
if (si == NULL)
return 0;
BUG_ON(si->si_blocks == 0);
BUG_ON(si->si_bh == NULL);
trace_ocfs2_refresh_slot_info(si->si_blocks);
/*
* We pass -1 as blocknr because we expect all of si->si_bh to
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
spin_unlock(&osb->osb_lock);
}
return ret;
}
/* post the our slot info stuff into it's destination bh and write it
* out. */
static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
int slot_num,
struct buffer_head **bh)
{
int blkind = slot_num / si->si_slots_per_block;
int slotno = slot_num % si->si_slots_per_block;
struct ocfs2_slot_map_extended *se;
BUG_ON(blkind >= si->si_blocks);
se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
if (si->si_slots[slot_num].sl_valid)
se->se_slots[slotno].es_node_num =
cpu_to_le32(si->si_slots[slot_num].sl_node_num);
*bh = si->si_bh[blkind];
}
static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
int slot_num,
struct buffer_head **bh)
{
int i;
struct ocfs2_slot_map *sm;
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid)
sm->sm_slots[i] =
cpu_to_le16(si->si_slots[i].sl_node_num);
else
sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
}
*bh = si->si_bh[0];
}
static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
struct ocfs2_slot_info *si,
int slot_num)
{
int status;
struct buffer_head *bh;
spin_lock(&osb->osb_lock);
if (si->si_extended)
ocfs2_update_disk_slot_extended(si, slot_num, &bh);
else
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
mlog_errno(status);
return status;
}
/*
* Calculate how many bytes are needed by the slot map. Returns
* an error if the slot map file is too small.
*/
static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
struct inode *inode,
unsigned long long *bytes)
{
unsigned long long bytes_needed;
if (ocfs2_uses_extended_slot_map(osb)) {
bytes_needed = osb->max_slots *
sizeof(struct ocfs2_extended_slot);
} else {
bytes_needed = osb->max_slots * sizeof(__le16);
}
if (bytes_needed > i_size_read(inode)) {
mlog(ML_ERROR,
"Slot map file is too small! (size %llu, needed %llu)\n",
i_size_read(inode), bytes_needed);
return -ENOSPC;
}
*bytes = bytes_needed;
return 0;
}
/* try to find global node in the slot info. Returns -ENOENT
* if nothing is found. */
static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
unsigned int node_num)
{
int i, ret = -ENOENT;
for(i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid &&
(node_num == si->si_slots[i].sl_node_num)) {
ret = i;
break;
}
}
return ret;
}
static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int preferred)
{
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
}
out:
return ret;
}
int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
int slot;
struct ocfs2_slot_info *si = osb->slot_info;
spin_lock(&osb->osb_lock);
slot = __ocfs2_node_num_to_slot(si, node_num);
spin_unlock(&osb->osb_lock);
return slot;
}
int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
unsigned int *node_num)
{
struct ocfs2_slot_info *si = osb->slot_info;
assert_spin_locked(&osb->osb_lock);
BUG_ON(slot_num < 0);
BUG_ON(slot_num > osb->max_slots);
if (!si->si_slots[slot_num].sl_valid)
return -ENOENT;
*node_num = si->si_slots[slot_num].sl_node_num;
return 0;
}
static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
unsigned int i;
if (si == NULL)
return;
if (si->si_inode)
iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
brelse(si->si_bh[i]);
si->si_bh[i] = NULL;
}
}
kfree(si->si_bh);
}
kfree(si);
}
int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
{
struct ocfs2_slot_info *si = osb->slot_info;
if (si == NULL)
return 0;
spin_lock(&osb->osb_lock);
ocfs2_invalidate_slot(si, slot_num);
spin_unlock(&osb->osb_lock);
return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}
static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
int status = 0;
u64 blkno;
unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
if (status)
goto bail;
blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
BUG_ON(blocks > UINT_MAX);
si->si_blocks = blocks;
if (!si->si_blocks)
goto bail;
if (si->si_extended)
si->si_slots_per_block =
(osb->sb->s_blocksize /
sizeof(struct ocfs2_extended_slot));
else
si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
/* The size checks above should ensure this */
BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!si->si_bh) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
for (i = 0; i < si->si_blocks; i++) {
status = ocfs2_extent_map_get_blocks(si->si_inode, i,
&blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
si->si_bh[i] = bh;
}
bail:
return status;
}
int ocfs2_init_slot_info(struct ocfs2_super *osb)
{
int status;
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
si = kzalloc(sizeof(struct ocfs2_slot_info) +
(sizeof(struct ocfs2_slot) * osb->max_slots),
GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
si->si_slots = (struct ocfs2_slot *)((char *)si +
sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
si->si_inode = inode;
status = ocfs2_map_slot_buffers(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0 && si)
__ocfs2_free_slot_info(si);
return status;
}
void ocfs2_free_slot_info(struct ocfs2_super *osb)
{
struct ocfs2_slot_info *si = osb->slot_info;
osb->slot_info = NULL;
__ocfs2_free_slot_info(si);
}
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
int slot;
struct ocfs2_slot_info *si;
si = osb->slot_info;
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
if (slot < 0) {
/* if no slot yet, then just take 1st available
* one. */
slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
}
} else
printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
"allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&osb->osb_lock);
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);
bail:
return status;
}
void ocfs2_put_slot(struct ocfs2_super *osb)
{
int status, slot_num;
struct ocfs2_slot_info *si = osb->slot_info;
if (!si)
return;
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
slot_num = osb->slot_num;
ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bail:
ocfs2_free_slot_info(osb);
}

44
fs/ocfs2/slot_map.h Normal file
View file

@ -0,0 +1,44 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* slotmap.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef SLOTMAP_H
#define SLOTMAP_H
int ocfs2_init_slot_info(struct ocfs2_super *osb);
void ocfs2_free_slot_info(struct ocfs2_super *osb);
int ocfs2_find_slot(struct ocfs2_super *osb);
void ocfs2_put_slot(struct ocfs2_super *osb);
int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
unsigned int *node_num);
int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
#endif

449
fs/ocfs2/stack_o2cb.c Normal file
View file

@ -0,0 +1,449 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* stack_o2cb.c
*
* Code which interfaces ocfs2 with the o2cb stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, version 2.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <linux/slab.h>
#include <linux/module.h>
/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
#include <linux/fs.h>
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
#include "stackglue.h"
struct o2dlm_private {
struct dlm_eviction_cb op_eviction_cb;
};
static struct ocfs2_stack_plugin o2cb_stack;
/* These should be identical */
#if (DLM_LOCK_IV != LKM_IVMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_NL != LKM_NLMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_CR != LKM_CRMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_CW != LKM_CWMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_PR != LKM_PRMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_PW != LKM_PWMODE)
# error Lock modes do not match
#endif
#if (DLM_LOCK_EX != LKM_EXMODE)
# error Lock modes do not match
#endif
static inline int mode_to_o2dlm(int mode)
{
BUG_ON(mode > LKM_MAXMODE);
return mode;
}
#define map_flag(_generic, _o2dlm) \
if (flags & (_generic)) { \
flags &= ~(_generic); \
o2dlm_flags |= (_o2dlm); \
}
static int flags_to_o2dlm(u32 flags)
{
int o2dlm_flags = 0;
map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
/* map_flag() should have cleared every flag passed in */
BUG_ON(flags != 0);
return o2dlm_flags;
}
#undef map_flag
/*
* Map an o2dlm status to standard errno values.
*
* o2dlm only uses a handful of these, and returns even fewer to the
* caller. Still, we try to assign sane values to each error.
*
* The following value pairs have special meanings to dlmglue, thus
* the right hand side needs to stay unique - never duplicate the
* mapping elsewhere in the table!
*
* DLM_NORMAL: 0
* DLM_NOTQUEUED: -EAGAIN
* DLM_CANCELGRANT: -EBUSY
* DLM_CANCEL: -DLM_ECANCEL
*/
/* Keep in sync with dlmapi.h */
static int status_map[] = {
[DLM_NORMAL] = 0, /* Success */
[DLM_GRANTED] = -EINVAL,
[DLM_DENIED] = -EACCES,
[DLM_DENIED_NOLOCKS] = -EACCES,
[DLM_WORKING] = -EACCES,
[DLM_BLOCKED] = -EINVAL,
[DLM_BLOCKED_ORPHAN] = -EINVAL,
[DLM_DENIED_GRACE_PERIOD] = -EACCES,
[DLM_SYSERR] = -ENOMEM, /* It is what it is */
[DLM_NOSUPPORT] = -EPROTO,
[DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
[DLM_IVLOCKID] = -EINVAL,
[DLM_SYNC] = -EINVAL,
[DLM_BADTYPE] = -EINVAL,
[DLM_BADRESOURCE] = -EINVAL,
[DLM_MAXHANDLES] = -ENOMEM,
[DLM_NOCLINFO] = -EINVAL,
[DLM_NOLOCKMGR] = -EINVAL,
[DLM_NOPURGED] = -EINVAL,
[DLM_BADARGS] = -EINVAL,
[DLM_VOID] = -EINVAL,
[DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
[DLM_IVBUFLEN] = -EINVAL,
[DLM_CVTUNGRANT] = -EPERM,
[DLM_BADPARAM] = -EINVAL,
[DLM_VALNOTVALID] = -EINVAL,
[DLM_REJECTED] = -EPERM,
[DLM_ABORT] = -EINVAL,
[DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
[DLM_IVRESHANDLE] = -EINVAL,
[DLM_DEADLOCK] = -EDEADLK,
[DLM_DENIED_NOASTS] = -EINVAL,
[DLM_FORWARD] = -EINVAL,
[DLM_TIMEOUT] = -ETIMEDOUT,
[DLM_IVGROUPID] = -EINVAL,
[DLM_VERS_CONFLICT] = -EOPNOTSUPP,
[DLM_BAD_DEVICE_PATH] = -ENOENT,
[DLM_NO_DEVICE_PERMISSION] = -EPERM,
[DLM_NO_CONTROL_DEVICE] = -ENOENT,
[DLM_RECOVERING] = -ENOTCONN,
[DLM_MIGRATING] = -ERESTART,
[DLM_MAXSTATS] = -EINVAL,
};
static int dlm_status_to_errno(enum dlm_status status)
{
BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
return status_map[status];
}
static void o2dlm_lock_ast_wrapper(void *astarg)
{
struct ocfs2_dlm_lksb *lksb = astarg;
lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
}
static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
{
struct ocfs2_dlm_lksb *lksb = astarg;
lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
}
static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
{
struct ocfs2_dlm_lksb *lksb = astarg;
int error = dlm_status_to_errno(status);
/*
* In o2dlm, you can get both the lock_ast() for the lock being
* granted and the unlock_ast() for the CANCEL failing. A
* successful cancel sends DLM_NORMAL here. If the
* lock grant happened before the cancel arrived, you get
* DLM_CANCELGRANT.
*
* There's no need for the double-ast. If we see DLM_CANCELGRANT,
* we just ignore it. We expect the lock_ast() to handle the
* granted lock.
*/
if (status == DLM_CANCELGRANT)
return;
lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
}
static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen)
{
enum dlm_status status;
int o2dlm_mode = mode_to_o2dlm(mode);
int o2dlm_flags = flags_to_o2dlm(flags);
int ret;
status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
o2dlm_flags, name, namelen,
o2dlm_lock_ast_wrapper, lksb,
o2dlm_blocking_ast_wrapper);
ret = dlm_status_to_errno(status);
return ret;
}
static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
enum dlm_status status;
int o2dlm_flags = flags_to_o2dlm(flags);
int ret;
status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
ret = dlm_status_to_errno(status);
return ret;
}
static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
return dlm_status_to_errno(lksb->lksb_o2dlm.status);
}
/*
* o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB
* contents, it will zero out the LVB. Thus the caller can always trust
* the contents.
*/
static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
{
return 1;
}
static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
{
return (void *)(lksb->lksb_o2dlm.lvb);
}
static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
}
/*
* Check if this node is heartbeating and is connected to all other
* heartbeating nodes.
*/
static int o2cb_cluster_check(void)
{
u8 node_num;
int i;
unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
node_num = o2nm_this_node();
if (node_num == O2NM_MAX_NODES) {
printk(KERN_ERR "o2cb: This node has not been configured.\n");
return -EINVAL;
}
/*
* o2dlm expects o2net sockets to be created. If not, then
* dlm_join_domain() fails with a stack of errors which are both cryptic
* and incomplete. The idea here is to detect upfront whether we have
* managed to connect to all nodes or not. If not, then list the nodes
* to allow the user to check the configuration (incorrect IP, firewall,
* etc.) Yes, this is racy. But its not the end of the world.
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
o2hb_fill_node_map(hbmap, sizeof(hbmap));
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
o2net_fill_node_map(netmap, sizeof(netmap));
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
if (!memcmp(hbmap, netmap, sizeof(hbmap)))
return 0;
if (i < O2CB_MAP_STABILIZE_COUNT)
msleep(1000);
}
printk(KERN_ERR "o2cb: This node could not connect to nodes:");
i = -1;
while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
i + 1)) < O2NM_MAX_NODES) {
if (!test_bit(i, netmap))
printk(" %u", i);
}
printk(".\n");
return -ENOTCONN;
}
/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
{
int rc = 0;
u32 dlm_key;
struct dlm_ctxt *dlm;
struct o2dlm_private *priv;
struct dlm_protocol_version fs_version;
BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL);
/* Ensure cluster stack is up and all nodes are connected */
rc = o2cb_cluster_check();
if (rc) {
printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
"before retrying.\n");
goto out;
}
priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
if (!priv) {
rc = -ENOMEM;
goto out_free;
}
/* This just fills the structure in. It is safe to pass conn. */
dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
conn);
conn->cc_private = priv;
/* used by the dlm code to make message headers unique, each
* node in this domain must agree on this. */
dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
fs_version.pv_major = conn->cc_version.pv_major;
fs_version.pv_minor = conn->cc_version.pv_minor;
dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
if (IS_ERR(dlm)) {
rc = PTR_ERR(dlm);
mlog_errno(rc);
goto out_free;
}
conn->cc_version.pv_major = fs_version.pv_major;
conn->cc_version.pv_minor = fs_version.pv_minor;
conn->cc_lockspace = dlm;
dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
out_free:
if (rc)
kfree(conn->cc_private);
out:
return rc;
}
static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
struct dlm_ctxt *dlm = conn->cc_lockspace;
struct o2dlm_private *priv = conn->cc_private;
dlm_unregister_eviction_cb(&priv->op_eviction_cb);
conn->cc_private = NULL;
kfree(priv);
dlm_unregister_domain(dlm);
conn->cc_lockspace = NULL;
return 0;
}
static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
unsigned int *node)
{
int node_num;
node_num = o2nm_this_node();
if (node_num == O2NM_INVALID_NODE_NUM)
return -ENOENT;
if (node_num >= O2NM_MAX_NODES)
return -EOVERFLOW;
*node = node_num;
return 0;
}
static struct ocfs2_stack_operations o2cb_stack_ops = {
.connect = o2cb_cluster_connect,
.disconnect = o2cb_cluster_disconnect,
.this_node = o2cb_cluster_this_node,
.dlm_lock = o2cb_dlm_lock,
.dlm_unlock = o2cb_dlm_unlock,
.lock_status = o2cb_dlm_lock_status,
.lvb_valid = o2cb_dlm_lvb_valid,
.lock_lvb = o2cb_dlm_lvb,
.dump_lksb = o2cb_dump_lksb,
};
static struct ocfs2_stack_plugin o2cb_stack = {
.sp_name = "o2cb",
.sp_ops = &o2cb_stack_ops,
.sp_owner = THIS_MODULE,
};
static int __init o2cb_stack_init(void)
{
return ocfs2_stack_glue_register(&o2cb_stack);
}
static void __exit o2cb_stack_exit(void)
{
ocfs2_stack_glue_unregister(&o2cb_stack);
}
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
MODULE_LICENSE("GPL");
module_init(o2cb_stack_init);
module_exit(o2cb_stack_exit);

1136
fs/ocfs2/stack_user.c Normal file

File diff suppressed because it is too large Load diff

748
fs/ocfs2/stackglue.c Normal file
View file

@ -0,0 +1,748 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* stackglue.c
*
* Code which implements an OCFS2 specific interface to underlying
* cluster stacks.
*
* Copyright (C) 2007, 2009 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, version 2.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/sysctl.h>
#include "ocfs2_fs.h"
#include "stackglue.h"
#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
#define OCFS2_STACK_PLUGIN_USER "user"
#define OCFS2_MAX_HB_CTL_PATH 256
static struct ocfs2_protocol_version locking_max_version;
static DEFINE_SPINLOCK(ocfs2_stack_lock);
static LIST_HEAD(ocfs2_stack_list);
static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
/*
* The stack currently in use. If not null, active_stack->sp_count > 0,
* the module is pinned, and the locking protocol cannot be changed.
*/
static struct ocfs2_stack_plugin *active_stack;
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
assert_spin_locked(&ocfs2_stack_lock);
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
if (!strcmp(p->sp_name, name))
return p;
}
return NULL;
}
static int ocfs2_stack_driver_request(const char *stack_name,
const char *plugin_name)
{
int rc;
struct ocfs2_stack_plugin *p;
spin_lock(&ocfs2_stack_lock);
/*
* If the stack passed by the filesystem isn't the selected one,
* we can't continue.
*/
if (strcmp(stack_name, cluster_stack_name)) {
rc = -EBUSY;
goto out;
}
if (active_stack) {
/*
* If the active stack isn't the one we want, it cannot
* be selected right now.
*/
if (!strcmp(active_stack->sp_name, plugin_name))
rc = 0;
else
rc = -EBUSY;
goto out;
}
p = ocfs2_stack_lookup(plugin_name);
if (!p || !try_module_get(p->sp_owner)) {
rc = -ENOENT;
goto out;
}
active_stack = p;
rc = 0;
out:
/* If we found it, pin it */
if (!rc)
active_stack->sp_count++;
spin_unlock(&ocfs2_stack_lock);
return rc;
}
/*
* This function looks up the appropriate stack and makes it active. If
* there is no stack, it tries to load it. It will fail if the stack still
* cannot be found. It will also fail if a different stack is in use.
*/
static int ocfs2_stack_driver_get(const char *stack_name)
{
int rc;
char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
/*
* Classic stack does not pass in a stack name. This is
* compatible with older tools as well.
*/
if (!stack_name || !*stack_name)
stack_name = OCFS2_STACK_PLUGIN_O2CB;
if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
printk(KERN_ERR
"ocfs2 passed an invalid cluster stack label: \"%s\"\n",
stack_name);
return -EINVAL;
}
/* Anything that isn't the classic stack is a user stack */
if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
plugin_name = OCFS2_STACK_PLUGIN_USER;
rc = ocfs2_stack_driver_request(stack_name, plugin_name);
if (rc == -ENOENT) {
request_module("ocfs2_stack_%s", plugin_name);
rc = ocfs2_stack_driver_request(stack_name, plugin_name);
}
if (rc == -ENOENT) {
printk(KERN_ERR
"ocfs2: Cluster stack driver \"%s\" cannot be found\n",
plugin_name);
} else if (rc == -EBUSY) {
printk(KERN_ERR
"ocfs2: A different cluster stack is in use\n");
}
return rc;
}
static void ocfs2_stack_driver_put(void)
{
spin_lock(&ocfs2_stack_lock);
BUG_ON(active_stack == NULL);
BUG_ON(active_stack->sp_count == 0);
active_stack->sp_count--;
if (!active_stack->sp_count) {
module_put(active_stack->sp_owner);
active_stack = NULL;
}
spin_unlock(&ocfs2_stack_lock);
}
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
{
int rc;
spin_lock(&ocfs2_stack_lock);
if (!ocfs2_stack_lookup(plugin->sp_name)) {
plugin->sp_count = 0;
plugin->sp_max_proto = locking_max_version;
list_add(&plugin->sp_list, &ocfs2_stack_list);
printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
plugin->sp_name);
rc = 0;
} else {
printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
plugin->sp_name);
rc = -EEXIST;
}
spin_unlock(&ocfs2_stack_lock);
return rc;
}
EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
{
struct ocfs2_stack_plugin *p;
spin_lock(&ocfs2_stack_lock);
p = ocfs2_stack_lookup(plugin->sp_name);
if (p) {
BUG_ON(p != plugin);
BUG_ON(plugin == active_stack);
BUG_ON(plugin->sp_count != 0);
list_del_init(&plugin->sp_list);
printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
plugin->sp_name);
} else {
printk(KERN_ERR "Stack \"%s\" is not registered\n",
plugin->sp_name);
}
spin_unlock(&ocfs2_stack_lock);
}
EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
{
struct ocfs2_stack_plugin *p;
spin_lock(&ocfs2_stack_lock);
if (memcmp(max_proto, &locking_max_version,
sizeof(struct ocfs2_protocol_version))) {
BUG_ON(locking_max_version.pv_major != 0);
locking_max_version = *max_proto;
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
p->sp_max_proto = locking_max_version;
}
}
spin_unlock(&ocfs2_stack_lock);
}
EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
/*
* The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
* for the ast and bast functions. They will pass the lksb to the ast
* and bast. The caller can wrap the lksb with their own structure to
* get more information.
*/
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen)
{
if (!lksb->lksb_conn)
lksb->lksb_conn = conn;
else
BUG_ON(lksb->lksb_conn != conn);
return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
name, namelen);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
BUG_ON(lksb->lksb_conn == NULL);
return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
return active_stack->sp_ops->lock_status(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
{
return active_stack->sp_ops->lvb_valid(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
{
return active_stack->sp_ops->lock_lvb(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
active_stack->sp_ops->dump_lksb(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
int ocfs2_stack_supports_plocks(void)
{
return active_stack && active_stack->sp_ops->plock;
}
EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
/*
* ocfs2_plock() can only be safely called if
* ocfs2_stack_supports_plocks() returned true
*/
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl)
{
WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
if (active_stack->sp_ops->plock)
return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
const char *cluster_name,
int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn)
{
int rc = 0;
struct ocfs2_cluster_connection *new_conn;
BUG_ON(group == NULL);
BUG_ON(conn == NULL);
BUG_ON(recovery_handler == NULL);
if (grouplen > GROUP_NAME_MAX) {
rc = -EINVAL;
goto out;
}
if (memcmp(&lproto->lp_max_version, &locking_max_version,
sizeof(struct ocfs2_protocol_version))) {
rc = -EINVAL;
goto out;
}
new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
GFP_KERNEL);
if (!new_conn) {
rc = -ENOMEM;
goto out;
}
strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
if (cluster_name_len)
strlcpy(new_conn->cc_cluster_name, cluster_name,
CLUSTER_NAME_MAX + 1);
new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
new_conn->cc_proto = lproto;
/* Start the new connection at our maximum compatibility level */
new_conn->cc_version = lproto->lp_max_version;
/* This will pin the stack driver if successful */
rc = ocfs2_stack_driver_get(stack_name);
if (rc)
goto out_free;
rc = active_stack->sp_ops->connect(new_conn);
if (rc) {
ocfs2_stack_driver_put();
goto out_free;
}
*conn = new_conn;
out_free:
if (rc)
kfree(new_conn);
out:
return rc;
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
/* The caller will ensure all nodes have the same cluster stack */
int ocfs2_cluster_connect_agnostic(const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn)
{
char *stack_name = NULL;
if (cluster_stack_name[0])
stack_name = cluster_stack_name;
return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
lproto, recovery_handler, recovery_data,
conn);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
/* If hangup_pending is 0, the stack driver will be dropped */
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending)
{
int ret;
BUG_ON(conn == NULL);
ret = active_stack->sp_ops->disconnect(conn);
/* XXX Should we free it anyway? */
if (!ret) {
kfree(conn);
if (!hangup_pending)
ocfs2_stack_driver_put();
}
return ret;
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
/*
* Leave the group for this filesystem. This is executed by a userspace
* program (stored in ocfs2_hb_ctl_path).
*/
static void ocfs2_leave_group(const char *group)
{
int ret;
char *argv[5], *envp[3];
argv[0] = ocfs2_hb_ctl_path;
argv[1] = "-K";
argv[2] = "-u";
argv[3] = (char *)group;
argv[4] = NULL;
/* minimal command environment taken from cpu_run_sbin_hotplug */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (ret < 0) {
printk(KERN_ERR
"ocfs2: Error %d running user helper "
"\"%s %s %s %s\"\n",
ret, argv[0], argv[1], argv[2], argv[3]);
}
}
/*
* Hangup is a required post-umount. ocfs2-tools software expects the
* filesystem to call "ocfs2_hb_ctl" during unmount. This happens
* regardless of whether the DLM got started, so we can't do it
* in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does
* the actual work.
*/
void ocfs2_cluster_hangup(const char *group, int grouplen)
{
BUG_ON(group == NULL);
BUG_ON(group[grouplen] != '\0');
ocfs2_leave_group(group);
/* cluster_disconnect() was called with hangup_pending==1 */
ocfs2_stack_driver_put();
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
unsigned int *node)
{
return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
/*
* Sysfs bits
*/
static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
ssize_t ret = 0;
spin_lock(&ocfs2_stack_lock);
if (locking_max_version.pv_major)
ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
locking_max_version.pv_major,
locking_max_version.pv_minor);
spin_unlock(&ocfs2_stack_lock);
return ret;
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
__ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
struct ocfs2_stack_plugin *p;
spin_lock(&ocfs2_stack_lock);
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
ret = snprintf(buf, remain, "%s\n",
p->sp_name);
if (ret < 0) {
total = ret;
break;
}
if (ret == remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
}
total += ret;
remain -= ret;
}
spin_unlock(&ocfs2_stack_lock);
return total;
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
__ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
ssize_t ret = 0;
spin_lock(&ocfs2_stack_lock);
if (active_stack) {
ret = snprintf(buf, PAGE_SIZE, "%s\n",
active_stack->sp_name);
if (ret == PAGE_SIZE)
ret = -E2BIG;
}
spin_unlock(&ocfs2_stack_lock);
return ret;
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
__ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
ssize_t ret;
spin_lock(&ocfs2_stack_lock);
ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
spin_unlock(&ocfs2_stack_lock);
return ret;
}
static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
size_t len = count;
ssize_t ret;
if (len == 0)
return len;
if (buf[len - 1] == '\n')
len--;
if ((len != OCFS2_STACK_LABEL_LEN) ||
(strnlen(buf, len) != len))
return -EINVAL;
spin_lock(&ocfs2_stack_lock);
if (active_stack) {
if (!strncmp(buf, cluster_stack_name, len))
ret = count;
else
ret = -EBUSY;
} else {
memcpy(cluster_stack_name, buf, len);
ret = count;
}
spin_unlock(&ocfs2_stack_lock);
return ret;
}
static struct kobj_attribute ocfs2_attr_cluster_stack =
__ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "1\n");
}
static struct kobj_attribute ocfs2_attr_dlm_recover_support =
__ATTR(dlm_recover_callback_support, S_IRUGO,
ocfs2_dlm_recover_show, NULL);
static struct attribute *ocfs2_attrs[] = {
&ocfs2_attr_max_locking_protocol.attr,
&ocfs2_attr_loaded_cluster_plugins.attr,
&ocfs2_attr_active_cluster_plugin.attr,
&ocfs2_attr_cluster_stack.attr,
&ocfs2_attr_dlm_recover_support.attr,
NULL,
};
static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
static struct kset *ocfs2_kset;
static void ocfs2_sysfs_exit(void)
{
kset_unregister(ocfs2_kset);
}
static int ocfs2_sysfs_init(void)
{
int ret;
ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
if (!ocfs2_kset)
return -ENOMEM;
ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
if (ret)
goto error;
return 0;
error:
kset_unregister(ocfs2_kset);
return ret;
}
/*
* Sysctl bits
*
* The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't
* make as much sense in a multiple cluster stack world, but it's safer
* and easier to preserve the name.
*/
#define FS_OCFS2_NM 1
static struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
.maxlen = OCFS2_MAX_HB_CTL_PATH,
.mode = 0644,
.proc_handler = proc_dostring,
},
{ }
};
static struct ctl_table ocfs2_mod_table[] = {
{
.procname = "nm",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_nm_table
},
{ }
};
static struct ctl_table ocfs2_kern_table[] = {
{
.procname = "ocfs2",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_mod_table
},
{ }
};
static struct ctl_table ocfs2_root_table[] = {
{
.procname = "fs",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_kern_table
},
{ }
};
static struct ctl_table_header *ocfs2_table_header;
/*
* Initialization
*/
static int __init ocfs2_stack_glue_init(void)
{
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
if (!ocfs2_table_header) {
printk(KERN_ERR
"ocfs2 stack glue: unable to register sysctl\n");
return -ENOMEM; /* or something. */
}
return ocfs2_sysfs_init();
}
static void __exit ocfs2_stack_glue_exit(void)
{
memset(&locking_max_version, 0,
sizeof(struct ocfs2_protocol_version));
locking_max_version.pv_major = 0;
locking_max_version.pv_minor = 0;
ocfs2_sysfs_exit();
if (ocfs2_table_header)
unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
MODULE_LICENSE("GPL");
module_init(ocfs2_stack_glue_init);
module_exit(ocfs2_stack_glue_exit);

301
fs/ocfs2/stackglue.h Normal file
View file

@ -0,0 +1,301 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* stackglue.h
*
* Glue to the underlying cluster stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, version 2.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef STACKGLUE_H
#define STACKGLUE_H
#include <linux/types.h>
#include <linux/list.h>
#include <linux/dlmconstants.h>
#include "dlm/dlmapi.h"
#include <linux/dlm.h>
/* Needed for plock-related prototypes */
struct file;
struct file_lock;
/*
* dlmconstants.h does not have a LOCAL flag. We hope to remove it
* some day, but right now we need it. Let's fake it. This value is larger
* than any flag in dlmconstants.h.
*/
#define DLM_LKF_LOCAL 0x00100000
/*
* This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
* wants to be in a public header.
*/
#define GROUP_NAME_MAX 64
/* This shadows OCFS2_CLUSTER_NAME_LEN */
#define CLUSTER_NAME_MAX 16
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
* its inter-node behavior. See dlmglue.c for more information.
*/
struct ocfs2_protocol_version {
u8 pv_major;
u8 pv_minor;
};
/*
* The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
* has a pointer to separately allocated lvb space. This struct exists only to
* include in the lksb union to make space for a combined dlm_lksb and lvb.
*/
struct fsdlm_lksb_plus_lvb {
struct dlm_lksb lksb;
char lvb[DLM_LVB_LEN];
};
/*
* A union of all lock status structures. We define it here so that the
* size of the union is known. Lock status structures are embedded in
* ocfs2 inodes.
*/
struct ocfs2_cluster_connection;
struct ocfs2_dlm_lksb {
union {
struct dlm_lockstatus lksb_o2dlm;
struct dlm_lksb lksb_fsdlm;
struct fsdlm_lksb_plus_lvb padding;
};
struct ocfs2_cluster_connection *lksb_conn;
};
/*
* The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
*/
struct ocfs2_locking_protocol {
struct ocfs2_protocol_version lp_max_version;
void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
};
/*
* A cluster connection. Mostly opaque to ocfs2, the connection holds
* state for the underlying stack. ocfs2 does use cc_version to determine
* locking compatibility.
*/
struct ocfs2_cluster_connection {
char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
char cc_cluster_name[CLUSTER_NAME_MAX + 1];
int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
void *cc_recovery_data;
void *cc_lockspace;
void *cc_private;
};
/*
* Each cluster stack implements the stack operations structure. Not used
* in the ocfs2 code, the stackglue code translates generic cluster calls
* into stack operations.
*/
struct ocfs2_stack_operations {
/*
* The fs code calls ocfs2_cluster_connect() to attach a new
* filesystem to the cluster stack. The ->connect() op is passed
* an ocfs2_cluster_connection with the name and recovery field
* filled in.
*
* The stack must set up any notification mechanisms and create
* the filesystem lockspace in the DLM. The lockspace should be
* stored on cc_lockspace. Any other information can be stored on
* cc_private.
*
* ->connect() must not return until it is guaranteed that
*
* - Node down notifications for the filesystem will be received
* and passed to conn->cc_recovery_handler().
* - Locking requests for the filesystem will be processed.
*/
int (*connect)(struct ocfs2_cluster_connection *conn);
/*
* The fs code calls ocfs2_cluster_disconnect() when a filesystem
* no longer needs cluster services. All DLM locks have been
* dropped, and recovery notification is being ignored by the
* fs code. The stack must disengage from the DLM and discontinue
* recovery notification.
*
* Once ->disconnect() has returned, the connection structure will
* be freed. Thus, a stack must not return from ->disconnect()
* until it will no longer reference the conn pointer.
*
* Once this call returns, the stack glue will be dropping this
* connection's reference on the module.
*/
int (*disconnect)(struct ocfs2_cluster_connection *conn);
/*
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
int (*this_node)(struct ocfs2_cluster_connection *conn,
unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
* callback should convert the flags and mode as appropriate.
*
* ast and bast functions are not part of the call because the
* stack will likely want to wrap ast and bast calls before passing
* them to stack->sp_proto. There is no astarg. The lksb will
* be passed back to the ast and bast functions. The caller can
* use this to find their object.
*/
int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen);
/*
* Call the underlying dlm unlock function. The ->dlm_unlock()
* function should convert the flags as appropriate.
*
* The unlock ast is not passed, as the stack will want to wrap
* it before calling stack->sp_proto->lp_unlock_ast(). There is
* no astarg. The lksb will be passed back to the unlock ast
* function. The caller can use this to find their object.
*/
int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags);
/*
* Return the status of the current lock status block. The fs
* code should never dereference the union. The ->lock_status()
* callback pulls out the stack-specific lksb, converts the status
* to a proper errno, and returns it.
*/
int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
/*
* Return non-zero if the LVB is valid.
*/
int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
/*
* Pull the lvb pointer off of the stack-specific lksb.
*/
void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
/*
* Cluster-aware posix locks
*
* This is NULL for stacks which do not support posix locks.
*/
int (*plock)(struct ocfs2_cluster_connection *conn,
u64 ino,
struct file *file,
int cmd,
struct file_lock *fl);
/*
* This is an optoinal debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
};
/*
* Each stack plugin must describe itself by registering a
* ocfs2_stack_plugin structure. This is only seen by stackglue and the
* stack driver.
*/
struct ocfs2_stack_plugin {
char *sp_name;
struct ocfs2_stack_operations *sp_ops;
struct module *sp_owner;
/* These are managed by the stackglue code. */
struct list_head sp_list;
unsigned int sp_count;
struct ocfs2_protocol_version sp_max_proto;
};
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
const char *cluster_name,
int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn);
/*
* Used by callers that don't store their stack name. They must ensure
* all nodes have the same stack.
*/
int ocfs2_cluster_connect_agnostic(const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn);
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen);
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags);
int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
int ocfs2_stack_supports_plocks(void);
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl);
void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
/* Used by stack plugins */
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
#endif /* STACKGLUE_H */

Some files were not shown because too many files have changed in this diff Show more