mirror of
https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git
synced 2025-09-07 00:38:05 -04:00
Fixed MTP to work with TWRP
This commit is contained in:
commit
f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
40
fs/ceph/Kconfig
Normal file
40
fs/ceph/Kconfig
Normal file
|
@ -0,0 +1,40 @@
|
|||
config CEPH_FS
|
||||
tristate "Ceph distributed file system"
|
||||
depends on INET
|
||||
select CEPH_LIB
|
||||
select LIBCRC32C
|
||||
select CRYPTO_AES
|
||||
select CRYPTO
|
||||
default n
|
||||
help
|
||||
Choose Y or M here to include support for mounting the
|
||||
experimental Ceph distributed file system. Ceph is an extremely
|
||||
scalable file system designed to provide high performance,
|
||||
reliable access to petabytes of storage.
|
||||
|
||||
More information at http://ceph.newdream.net/.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
if CEPH_FS
|
||||
config CEPH_FSCACHE
|
||||
bool "Enable Ceph client caching support"
|
||||
depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
|
||||
help
|
||||
Choose Y here to enable persistent, read-only local
|
||||
caching support for Ceph clients using FS-Cache
|
||||
|
||||
endif
|
||||
|
||||
config CEPH_FS_POSIX_ACL
|
||||
bool "Ceph POSIX Access Control Lists"
|
||||
depends on CEPH_FS
|
||||
select FS_POSIX_ACL
|
||||
help
|
||||
POSIX Access Control Lists (ACLs) support permissions for users and
|
||||
groups beyond the owner/group/world scheme.
|
||||
|
||||
To learn more about Access Control Lists, visit the POSIX ACLs for
|
||||
Linux website <http://acl.bestbits.at/>.
|
||||
|
||||
If you don't know what Access Control Lists are, say N
|
13
fs/ceph/Makefile
Normal file
13
fs/ceph/Makefile
Normal file
|
@ -0,0 +1,13 @@
|
|||
#
|
||||
# Makefile for CEPH filesystem.
|
||||
#
|
||||
|
||||
obj-$(CONFIG_CEPH_FS) += ceph.o
|
||||
|
||||
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
|
||||
export.o caps.o snap.o xattr.o \
|
||||
mds_client.o mdsmap.o strings.o ceph_frag.o \
|
||||
debugfs.o
|
||||
|
||||
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
|
||||
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
|
277
fs/ceph/acl.c
Normal file
277
fs/ceph/acl.c
Normal file
|
@ -0,0 +1,277 @@
|
|||
/*
|
||||
* linux/fs/ceph/acl.c
|
||||
*
|
||||
* Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/ceph/ceph_debug.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/posix_acl_xattr.h>
|
||||
#include <linux/posix_acl.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "super.h"
|
||||
|
||||
static inline void ceph_set_cached_acl(struct inode *inode,
|
||||
int type, struct posix_acl *acl)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
|
||||
set_cached_acl(inode, type, acl);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
}
|
||||
|
||||
static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
|
||||
int type)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct posix_acl *acl = ACL_NOT_CACHED;
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
|
||||
acl = get_cached_acl(inode, type);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
|
||||
{
|
||||
int size;
|
||||
const char *name;
|
||||
char *value = NULL;
|
||||
struct posix_acl *acl;
|
||||
|
||||
switch (type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
name = POSIX_ACL_XATTR_ACCESS;
|
||||
break;
|
||||
case ACL_TYPE_DEFAULT:
|
||||
name = POSIX_ACL_XATTR_DEFAULT;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
size = __ceph_getxattr(inode, name, "", 0);
|
||||
if (size > 0) {
|
||||
value = kzalloc(size, GFP_NOFS);
|
||||
if (!value)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
size = __ceph_getxattr(inode, name, value, size);
|
||||
}
|
||||
|
||||
if (size > 0)
|
||||
acl = posix_acl_from_xattr(&init_user_ns, value, size);
|
||||
else if (size == -ERANGE || size == -ENODATA || size == 0)
|
||||
acl = NULL;
|
||||
else
|
||||
acl = ERR_PTR(-EIO);
|
||||
|
||||
kfree(value);
|
||||
|
||||
if (!IS_ERR(acl))
|
||||
ceph_set_cached_acl(inode, type, acl);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
|
||||
{
|
||||
int ret = 0, size = 0;
|
||||
const char *name = NULL;
|
||||
char *value = NULL;
|
||||
struct iattr newattrs;
|
||||
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
|
||||
struct dentry *dentry;
|
||||
|
||||
switch (type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
name = POSIX_ACL_XATTR_ACCESS;
|
||||
if (acl) {
|
||||
ret = posix_acl_equiv_mode(acl, &new_mode);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
if (ret == 0)
|
||||
acl = NULL;
|
||||
}
|
||||
break;
|
||||
case ACL_TYPE_DEFAULT:
|
||||
if (!S_ISDIR(inode->i_mode)) {
|
||||
ret = acl ? -EINVAL : 0;
|
||||
goto out;
|
||||
}
|
||||
name = POSIX_ACL_XATTR_DEFAULT;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (acl) {
|
||||
size = posix_acl_xattr_size(acl->a_count);
|
||||
value = kmalloc(size, GFP_NOFS);
|
||||
if (!value) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
|
||||
if (ret < 0)
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
dentry = d_find_alias(inode);
|
||||
if (new_mode != old_mode) {
|
||||
newattrs.ia_mode = new_mode;
|
||||
newattrs.ia_valid = ATTR_MODE;
|
||||
ret = ceph_setattr(dentry, &newattrs);
|
||||
if (ret)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
ret = __ceph_setxattr(dentry, name, value, size, 0);
|
||||
if (ret) {
|
||||
if (new_mode != old_mode) {
|
||||
newattrs.ia_mode = old_mode;
|
||||
newattrs.ia_valid = ATTR_MODE;
|
||||
ceph_setattr(dentry, &newattrs);
|
||||
}
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
ceph_set_cached_acl(inode, type, acl);
|
||||
|
||||
out_dput:
|
||||
dput(dentry);
|
||||
out_free:
|
||||
kfree(value);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
|
||||
struct ceph_acls_info *info)
|
||||
{
|
||||
struct posix_acl *acl, *default_acl;
|
||||
size_t val_size1 = 0, val_size2 = 0;
|
||||
struct ceph_pagelist *pagelist = NULL;
|
||||
void *tmp_buf = NULL;
|
||||
int err;
|
||||
|
||||
err = posix_acl_create(dir, mode, &default_acl, &acl);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (acl) {
|
||||
int ret = posix_acl_equiv_mode(acl, mode);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
if (ret == 0) {
|
||||
posix_acl_release(acl);
|
||||
acl = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (!default_acl && !acl)
|
||||
return 0;
|
||||
|
||||
if (acl)
|
||||
val_size1 = posix_acl_xattr_size(acl->a_count);
|
||||
if (default_acl)
|
||||
val_size2 = posix_acl_xattr_size(default_acl->a_count);
|
||||
|
||||
err = -ENOMEM;
|
||||
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
|
||||
if (!tmp_buf)
|
||||
goto out_err;
|
||||
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
|
||||
if (!pagelist)
|
||||
goto out_err;
|
||||
ceph_pagelist_init(pagelist);
|
||||
|
||||
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
|
||||
|
||||
if (acl) {
|
||||
size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
|
||||
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
|
||||
if (err)
|
||||
goto out_err;
|
||||
ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
|
||||
len);
|
||||
err = posix_acl_to_xattr(&init_user_ns, acl,
|
||||
tmp_buf, val_size1);
|
||||
if (err < 0)
|
||||
goto out_err;
|
||||
ceph_pagelist_encode_32(pagelist, val_size1);
|
||||
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
|
||||
}
|
||||
if (default_acl) {
|
||||
size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
|
||||
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
|
||||
if (err)
|
||||
goto out_err;
|
||||
err = ceph_pagelist_encode_string(pagelist,
|
||||
POSIX_ACL_XATTR_DEFAULT, len);
|
||||
err = posix_acl_to_xattr(&init_user_ns, default_acl,
|
||||
tmp_buf, val_size2);
|
||||
if (err < 0)
|
||||
goto out_err;
|
||||
ceph_pagelist_encode_32(pagelist, val_size2);
|
||||
ceph_pagelist_append(pagelist, tmp_buf, val_size2);
|
||||
}
|
||||
|
||||
kfree(tmp_buf);
|
||||
|
||||
info->acl = acl;
|
||||
info->default_acl = default_acl;
|
||||
info->pagelist = pagelist;
|
||||
return 0;
|
||||
|
||||
out_err:
|
||||
posix_acl_release(acl);
|
||||
posix_acl_release(default_acl);
|
||||
kfree(tmp_buf);
|
||||
if (pagelist)
|
||||
ceph_pagelist_release(pagelist);
|
||||
return err;
|
||||
}
|
||||
|
||||
void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
|
||||
{
|
||||
if (!inode)
|
||||
return;
|
||||
ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
|
||||
ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
|
||||
}
|
||||
|
||||
void ceph_release_acls_info(struct ceph_acls_info *info)
|
||||
{
|
||||
posix_acl_release(info->acl);
|
||||
posix_acl_release(info->default_acl);
|
||||
if (info->pagelist)
|
||||
ceph_pagelist_release(info->pagelist);
|
||||
}
|
1333
fs/ceph/addr.c
Normal file
1333
fs/ceph/addr.c
Normal file
File diff suppressed because it is too large
Load diff
402
fs/ceph/cache.c
Normal file
402
fs/ceph/cache.c
Normal file
|
@ -0,0 +1,402 @@
|
|||
/*
|
||||
* Ceph cache definitions.
|
||||
*
|
||||
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
|
||||
* Written by Milosz Tanski (milosz@adfin.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2
|
||||
* as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to:
|
||||
* Free Software Foundation
|
||||
* 51 Franklin Street, Fifth Floor
|
||||
* Boston, MA 02111-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "super.h"
|
||||
#include "cache.h"
|
||||
|
||||
struct ceph_aux_inode {
|
||||
struct timespec mtime;
|
||||
loff_t size;
|
||||
};
|
||||
|
||||
struct fscache_netfs ceph_cache_netfs = {
|
||||
.name = "ceph",
|
||||
.version = 0,
|
||||
};
|
||||
|
||||
static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
|
||||
void *buffer, uint16_t maxbuf)
|
||||
{
|
||||
const struct ceph_fs_client* fsc = cookie_netfs_data;
|
||||
uint16_t klen;
|
||||
|
||||
klen = sizeof(fsc->client->fsid);
|
||||
if (klen > maxbuf)
|
||||
return 0;
|
||||
|
||||
memcpy(buffer, &fsc->client->fsid, klen);
|
||||
return klen;
|
||||
}
|
||||
|
||||
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
|
||||
.name = "CEPH.fsid",
|
||||
.type = FSCACHE_COOKIE_TYPE_INDEX,
|
||||
.get_key = ceph_fscache_session_get_key,
|
||||
};
|
||||
|
||||
int ceph_fscache_register(void)
|
||||
{
|
||||
return fscache_register_netfs(&ceph_cache_netfs);
|
||||
}
|
||||
|
||||
void ceph_fscache_unregister(void)
|
||||
{
|
||||
fscache_unregister_netfs(&ceph_cache_netfs);
|
||||
}
|
||||
|
||||
int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
|
||||
{
|
||||
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
|
||||
&ceph_fscache_fsid_object_def,
|
||||
fsc, true);
|
||||
|
||||
if (fsc->fscache == NULL) {
|
||||
pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
|
||||
if (fsc->revalidate_wq == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
|
||||
void *buffer, uint16_t maxbuf)
|
||||
{
|
||||
const struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
uint16_t klen;
|
||||
|
||||
/* use ceph virtual inode (id + snaphot) */
|
||||
klen = sizeof(ci->i_vino);
|
||||
if (klen > maxbuf)
|
||||
return 0;
|
||||
|
||||
memcpy(buffer, &ci->i_vino, klen);
|
||||
return klen;
|
||||
}
|
||||
|
||||
static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
|
||||
void *buffer, uint16_t bufmax)
|
||||
{
|
||||
struct ceph_aux_inode aux;
|
||||
const struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
const struct inode* inode = &ci->vfs_inode;
|
||||
|
||||
memset(&aux, 0, sizeof(aux));
|
||||
aux.mtime = inode->i_mtime;
|
||||
aux.size = inode->i_size;
|
||||
|
||||
memcpy(buffer, &aux, sizeof(aux));
|
||||
|
||||
return sizeof(aux);
|
||||
}
|
||||
|
||||
static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
|
||||
uint64_t *size)
|
||||
{
|
||||
const struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
const struct inode* inode = &ci->vfs_inode;
|
||||
|
||||
*size = inode->i_size;
|
||||
}
|
||||
|
||||
static enum fscache_checkaux ceph_fscache_inode_check_aux(
|
||||
void *cookie_netfs_data, const void *data, uint16_t dlen)
|
||||
{
|
||||
struct ceph_aux_inode aux;
|
||||
struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
struct inode* inode = &ci->vfs_inode;
|
||||
|
||||
if (dlen != sizeof(aux))
|
||||
return FSCACHE_CHECKAUX_OBSOLETE;
|
||||
|
||||
memset(&aux, 0, sizeof(aux));
|
||||
aux.mtime = inode->i_mtime;
|
||||
aux.size = inode->i_size;
|
||||
|
||||
if (memcmp(data, &aux, sizeof(aux)) != 0)
|
||||
return FSCACHE_CHECKAUX_OBSOLETE;
|
||||
|
||||
dout("ceph inode 0x%p cached okay", ci);
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
|
||||
{
|
||||
struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
dout("ceph inode 0x%p now uncached", ci);
|
||||
|
||||
while (1) {
|
||||
nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
|
||||
.name = "CEPH.inode",
|
||||
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
|
||||
.get_key = ceph_fscache_inode_get_key,
|
||||
.get_attr = ceph_fscache_inode_get_attr,
|
||||
.get_aux = ceph_fscache_inode_get_aux,
|
||||
.check_aux = ceph_fscache_inode_check_aux,
|
||||
.now_uncached = ceph_fscache_inode_now_uncached,
|
||||
};
|
||||
|
||||
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
|
||||
struct ceph_inode_info* ci)
|
||||
{
|
||||
struct inode* inode = &ci->vfs_inode;
|
||||
|
||||
/* No caching for filesystem */
|
||||
if (fsc->fscache == NULL)
|
||||
return;
|
||||
|
||||
/* Only cache for regular files that are read only */
|
||||
if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
|
||||
return;
|
||||
|
||||
/* Avoid multiple racing open requests */
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
if (ci->fscache)
|
||||
goto done;
|
||||
|
||||
ci->fscache = fscache_acquire_cookie(fsc->fscache,
|
||||
&ceph_fscache_inode_object_def,
|
||||
ci, true);
|
||||
fscache_check_consistency(ci->fscache);
|
||||
done:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
}
|
||||
|
||||
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
|
||||
{
|
||||
struct fscache_cookie* cookie;
|
||||
|
||||
if ((cookie = ci->fscache) == NULL)
|
||||
return;
|
||||
|
||||
ci->fscache = NULL;
|
||||
|
||||
fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
|
||||
fscache_relinquish_cookie(cookie, 0);
|
||||
}
|
||||
|
||||
static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
|
||||
{
|
||||
if (!error)
|
||||
SetPageUptodate(page);
|
||||
}
|
||||
|
||||
static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
|
||||
{
|
||||
if (!error)
|
||||
SetPageUptodate(page);
|
||||
|
||||
unlock_page(page);
|
||||
}
|
||||
|
||||
static inline int cache_valid(struct ceph_inode_info *ci)
|
||||
{
|
||||
return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
|
||||
(ci->i_fscache_gen == ci->i_rdcache_gen));
|
||||
}
|
||||
|
||||
|
||||
/* Atempt to read from the fscache,
|
||||
*
|
||||
* This function is called from the readpage_nounlock context. DO NOT attempt to
|
||||
* unlock the page here (or in the callback).
|
||||
*/
|
||||
int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
int ret;
|
||||
|
||||
if (!cache_valid(ci))
|
||||
return -ENOBUFS;
|
||||
|
||||
ret = fscache_read_or_alloc_page(ci->fscache, page,
|
||||
ceph_vfs_readpage_complete, NULL,
|
||||
GFP_KERNEL);
|
||||
|
||||
switch (ret) {
|
||||
case 0: /* Page found */
|
||||
dout("page read submitted\n");
|
||||
return 0;
|
||||
case -ENOBUFS: /* Pages were not found, and can't be */
|
||||
case -ENODATA: /* Pages were not found */
|
||||
dout("page/inode not in cache\n");
|
||||
return ret;
|
||||
default:
|
||||
dout("%s: unknown error ret = %i\n", __func__, ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
int ceph_readpages_from_fscache(struct inode *inode,
|
||||
struct address_space *mapping,
|
||||
struct list_head *pages,
|
||||
unsigned *nr_pages)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
int ret;
|
||||
|
||||
if (!cache_valid(ci))
|
||||
return -ENOBUFS;
|
||||
|
||||
ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
|
||||
ceph_vfs_readpage_complete_unlock,
|
||||
NULL, mapping_gfp_mask(mapping));
|
||||
|
||||
switch (ret) {
|
||||
case 0: /* All pages found */
|
||||
dout("all-page read submitted\n");
|
||||
return 0;
|
||||
case -ENOBUFS: /* Some pages were not found, and can't be */
|
||||
case -ENODATA: /* some pages were not found */
|
||||
dout("page/inode not in cache\n");
|
||||
return ret;
|
||||
default:
|
||||
dout("%s: unknown error ret = %i\n", __func__, ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
int ret;
|
||||
|
||||
if (!PageFsCache(page))
|
||||
return;
|
||||
|
||||
if (!cache_valid(ci))
|
||||
return;
|
||||
|
||||
ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
|
||||
if (ret)
|
||||
fscache_uncache_page(ci->fscache, page);
|
||||
}
|
||||
|
||||
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
|
||||
if (!PageFsCache(page))
|
||||
return;
|
||||
|
||||
fscache_wait_on_page_write(ci->fscache, page);
|
||||
fscache_uncache_page(ci->fscache, page);
|
||||
}
|
||||
|
||||
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
|
||||
{
|
||||
if (fsc->revalidate_wq)
|
||||
destroy_workqueue(fsc->revalidate_wq);
|
||||
|
||||
fscache_relinquish_cookie(fsc->fscache, 0);
|
||||
fsc->fscache = NULL;
|
||||
}
|
||||
|
||||
static void ceph_revalidate_work(struct work_struct *work)
|
||||
{
|
||||
int issued;
|
||||
u32 orig_gen;
|
||||
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
|
||||
i_revalidate_work);
|
||||
struct inode *inode = &ci->vfs_inode;
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
issued = __ceph_caps_issued(ci, NULL);
|
||||
orig_gen = ci->i_rdcache_gen;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
if (!(issued & CEPH_CAP_FILE_CACHE)) {
|
||||
dout("revalidate_work lost cache before validation %p\n",
|
||||
inode);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!fscache_check_consistency(ci->fscache))
|
||||
fscache_invalidate(ci->fscache);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
/* Update the new valid generation (backwards sanity check too) */
|
||||
if (orig_gen > ci->i_fscache_gen) {
|
||||
ci->i_fscache_gen = orig_gen;
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
out:
|
||||
iput(&ci->vfs_inode);
|
||||
}
|
||||
|
||||
void ceph_queue_revalidate(struct inode *inode)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
|
||||
if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
|
||||
return;
|
||||
|
||||
ihold(inode);
|
||||
|
||||
if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
|
||||
&ci->i_revalidate_work)) {
|
||||
dout("ceph_queue_revalidate %p\n", inode);
|
||||
} else {
|
||||
dout("ceph_queue_revalidate %p failed\n)", inode);
|
||||
iput(inode);
|
||||
}
|
||||
}
|
||||
|
||||
void ceph_fscache_inode_init(struct ceph_inode_info *ci)
|
||||
{
|
||||
ci->fscache = NULL;
|
||||
/* The first load is verifed cookie open time */
|
||||
ci->i_fscache_gen = 1;
|
||||
INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
|
||||
}
|
182
fs/ceph/cache.h
Normal file
182
fs/ceph/cache.h
Normal file
|
@ -0,0 +1,182 @@
|
|||
/*
|
||||
* Ceph cache definitions.
|
||||
*
|
||||
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
|
||||
* Written by Milosz Tanski (milosz@adfin.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2
|
||||
* as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to:
|
||||
* Free Software Foundation
|
||||
* 51 Franklin Street, Fifth Floor
|
||||
* Boston, MA 02111-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _CEPH_CACHE_H
|
||||
#define _CEPH_CACHE_H
|
||||
|
||||
#ifdef CONFIG_CEPH_FSCACHE
|
||||
|
||||
extern struct fscache_netfs ceph_cache_netfs;
|
||||
|
||||
int ceph_fscache_register(void);
|
||||
void ceph_fscache_unregister(void);
|
||||
|
||||
int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
|
||||
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
|
||||
|
||||
void ceph_fscache_inode_init(struct ceph_inode_info *ci);
|
||||
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
|
||||
struct ceph_inode_info* ci);
|
||||
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
|
||||
|
||||
int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
|
||||
int ceph_readpages_from_fscache(struct inode *inode,
|
||||
struct address_space *mapping,
|
||||
struct list_head *pages,
|
||||
unsigned *nr_pages);
|
||||
void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
|
||||
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
|
||||
void ceph_queue_revalidate(struct inode *inode);
|
||||
|
||||
static inline void ceph_fscache_update_objectsize(struct inode *inode)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
fscache_attr_changed(ci->fscache);
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_invalidate(struct inode *inode)
|
||||
{
|
||||
fscache_invalidate(ceph_inode(inode)->fscache);
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_uncache_page(struct inode *inode,
|
||||
struct page *page)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
return fscache_uncache_page(ci->fscache, page);
|
||||
}
|
||||
|
||||
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
|
||||
{
|
||||
struct inode* inode = page->mapping->host;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
return fscache_maybe_release_page(ci->fscache, page, gfp);
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
|
||||
struct page *page)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
|
||||
__fscache_uncache_page(ci->fscache, page);
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
||||
struct list_head *pages)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
return fscache_readpages_cancel(ci->fscache, pages);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline int ceph_fscache_register(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_unregister(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
|
||||
struct ceph_inode_info* ci)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_uncache_page(struct inode *inode,
|
||||
struct page *pages)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int ceph_readpage_from_fscache(struct inode* inode,
|
||||
struct page *page)
|
||||
{
|
||||
return -ENOBUFS;
|
||||
}
|
||||
|
||||
static inline int ceph_readpages_from_fscache(struct inode *inode,
|
||||
struct address_space *mapping,
|
||||
struct list_head *pages,
|
||||
unsigned *nr_pages)
|
||||
{
|
||||
return -ENOBUFS;
|
||||
}
|
||||
|
||||
static inline void ceph_readpage_to_fscache(struct inode *inode,
|
||||
struct page *page)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_update_objectsize(struct inode *inode)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_invalidate(struct inode *inode)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_invalidate_fscache_page(struct inode *inode,
|
||||
struct page *page)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
|
||||
struct page *page)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
|
||||
struct list_head *pages)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void ceph_queue_revalidate(struct inode *inode)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
3332
fs/ceph/caps.c
Normal file
3332
fs/ceph/caps.c
Normal file
File diff suppressed because it is too large
Load diff
22
fs/ceph/ceph_frag.c
Normal file
22
fs/ceph/ceph_frag.c
Normal file
|
@ -0,0 +1,22 @@
|
|||
/*
|
||||
* Ceph 'frag' type
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/ceph/types.h>
|
||||
|
||||
int ceph_frag_compare(__u32 a, __u32 b)
|
||||
{
|
||||
unsigned va = ceph_frag_value(a);
|
||||
unsigned vb = ceph_frag_value(b);
|
||||
if (va < vb)
|
||||
return -1;
|
||||
if (va > vb)
|
||||
return 1;
|
||||
va = ceph_frag_bits(a);
|
||||
vb = ceph_frag_bits(b);
|
||||
if (va < vb)
|
||||
return -1;
|
||||
if (va > vb)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
323
fs/ceph/debugfs.c
Normal file
323
fs/ceph/debugfs.c
Normal file
|
@ -0,0 +1,323 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
#include <linux/ceph/libceph.h>
|
||||
#include <linux/ceph/mon_client.h>
|
||||
#include <linux/ceph/auth.h>
|
||||
#include <linux/ceph/debugfs.h>
|
||||
|
||||
#include "super.h"
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
|
||||
#include "mds_client.h"
|
||||
|
||||
static int mdsmap_show(struct seq_file *s, void *p)
|
||||
{
|
||||
int i;
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
|
||||
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
|
||||
return 0;
|
||||
seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
|
||||
seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
|
||||
seq_printf(s, "session_timeout %d\n",
|
||||
fsc->mdsc->mdsmap->m_session_timeout);
|
||||
seq_printf(s, "session_autoclose %d\n",
|
||||
fsc->mdsc->mdsmap->m_session_autoclose);
|
||||
for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
|
||||
struct ceph_entity_addr *addr =
|
||||
&fsc->mdsc->mdsmap->m_info[i].addr;
|
||||
int state = fsc->mdsc->mdsmap->m_info[i].state;
|
||||
|
||||
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
|
||||
ceph_pr_addr(&addr->in_addr),
|
||||
ceph_mds_state_name(state));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* mdsc debugfs
|
||||
*/
|
||||
static int mdsc_show(struct seq_file *s, void *p)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_request *req;
|
||||
struct rb_node *rp;
|
||||
int pathlen;
|
||||
u64 pathbase;
|
||||
char *path;
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
|
||||
req = rb_entry(rp, struct ceph_mds_request, r_node);
|
||||
|
||||
if (req->r_request && req->r_session)
|
||||
seq_printf(s, "%lld\tmds%d\t", req->r_tid,
|
||||
req->r_session->s_mds);
|
||||
else if (!req->r_request)
|
||||
seq_printf(s, "%lld\t(no request)\t", req->r_tid);
|
||||
else
|
||||
seq_printf(s, "%lld\t(no session)\t", req->r_tid);
|
||||
|
||||
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
|
||||
|
||||
if (req->r_got_unsafe)
|
||||
seq_puts(s, "\t(unsafe)");
|
||||
else
|
||||
seq_puts(s, "\t");
|
||||
|
||||
if (req->r_inode) {
|
||||
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
|
||||
} else if (req->r_dentry) {
|
||||
path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
|
||||
&pathbase, 0);
|
||||
if (IS_ERR(path))
|
||||
path = NULL;
|
||||
spin_lock(&req->r_dentry->d_lock);
|
||||
seq_printf(s, " #%llx/%.*s (%s)",
|
||||
ceph_ino(req->r_dentry->d_parent->d_inode),
|
||||
req->r_dentry->d_name.len,
|
||||
req->r_dentry->d_name.name,
|
||||
path ? path : "");
|
||||
spin_unlock(&req->r_dentry->d_lock);
|
||||
kfree(path);
|
||||
} else if (req->r_path1) {
|
||||
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
|
||||
req->r_path1);
|
||||
} else {
|
||||
seq_printf(s, " #%llx", req->r_ino1.ino);
|
||||
}
|
||||
|
||||
if (req->r_old_dentry) {
|
||||
path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
|
||||
&pathbase, 0);
|
||||
if (IS_ERR(path))
|
||||
path = NULL;
|
||||
spin_lock(&req->r_old_dentry->d_lock);
|
||||
seq_printf(s, " #%llx/%.*s (%s)",
|
||||
req->r_old_dentry_dir ?
|
||||
ceph_ino(req->r_old_dentry_dir) : 0,
|
||||
req->r_old_dentry->d_name.len,
|
||||
req->r_old_dentry->d_name.name,
|
||||
path ? path : "");
|
||||
spin_unlock(&req->r_old_dentry->d_lock);
|
||||
kfree(path);
|
||||
} else if (req->r_path2) {
|
||||
if (req->r_ino2.ino)
|
||||
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
|
||||
req->r_path2);
|
||||
else
|
||||
seq_printf(s, " %s", req->r_path2);
|
||||
}
|
||||
|
||||
seq_puts(s, "\n");
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int caps_show(struct seq_file *s, void *p)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
int total, avail, used, reserved, min;
|
||||
|
||||
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
|
||||
seq_printf(s, "total\t\t%d\n"
|
||||
"avail\t\t%d\n"
|
||||
"used\t\t%d\n"
|
||||
"reserved\t%d\n"
|
||||
"min\t%d\n",
|
||||
total, avail, used, reserved, min);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dentry_lru_show(struct seq_file *s, void *ptr)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_dentry_info *di;
|
||||
|
||||
spin_lock(&mdsc->dentry_lru_lock);
|
||||
list_for_each_entry(di, &mdsc->dentry_lru, lru) {
|
||||
struct dentry *dentry = di->dentry;
|
||||
seq_printf(s, "%p %p\t%.*s\n",
|
||||
di, dentry, dentry->d_name.len, dentry->d_name.name);
|
||||
}
|
||||
spin_unlock(&mdsc->dentry_lru_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mds_sessions_show(struct seq_file *s, void *ptr)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_auth_client *ac = fsc->client->monc.auth;
|
||||
struct ceph_options *opt = fsc->client->options;
|
||||
int mds = -1;
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
|
||||
/* The 'num' portion of an 'entity name' */
|
||||
seq_printf(s, "global_id %llu\n", ac->global_id);
|
||||
|
||||
/* The -o name mount argument */
|
||||
seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
|
||||
|
||||
/* The list of MDS session rank+state */
|
||||
for (mds = 0; mds < mdsc->max_sessions; mds++) {
|
||||
struct ceph_mds_session *session =
|
||||
__ceph_lookup_mds_session(mdsc, mds);
|
||||
if (!session) {
|
||||
continue;
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
seq_printf(s, "mds.%d %s\n",
|
||||
session->s_mds,
|
||||
ceph_session_state_name(session->s_state));
|
||||
|
||||
ceph_put_mds_session(session);
|
||||
mutex_lock(&mdsc->mutex);
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Generate the <name>_fops file_operations wrapping each show function */
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
|
||||
|
||||
|
||||
/*
|
||||
* debugfs
|
||||
*/
|
||||
/*
 * debugfs write hook: set the writeback congestion threshold (KB).
 */
static int congestion_kb_set(void *data, u64 val)
{
	struct ceph_fs_client *fsc = data;

	fsc->mount_options->congestion_kb = (int)val;
	return 0;
}
|
||||
|
||||
/*
 * debugfs read hook: report the writeback congestion threshold (KB).
 */
static int congestion_kb_get(void *data, u64 *val)
{
	struct ceph_fs_client *fsc = data;

	*val = (u64)fsc->mount_options->congestion_kb;
	return 0;
}
|
||||
|
||||
/* read/write debugfs attribute for writeback_congestion_kb */
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
			congestion_kb_set, "%llu\n");
|
||||
|
||||
|
||||
/*
 * Remove all per-client debugfs entries.  Also used as the error path of
 * ceph_fs_debugfs_init(), so some pointers may still be NULL; that is
 * fine since debugfs_remove() ignores NULL dentries.
 */
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
	dout("ceph_fs_debugfs_cleanup\n");
	debugfs_remove(fsc->debugfs_bdi);
	debugfs_remove(fsc->debugfs_congestion_kb);
	debugfs_remove(fsc->debugfs_mdsmap);
	debugfs_remove(fsc->debugfs_mds_sessions);
	debugfs_remove(fsc->debugfs_caps);
	debugfs_remove(fsc->debugfs_mdsc);
	debugfs_remove(fsc->debugfs_dentry_lru);
}
|
||||
|
||||
/*
 * Create the per-client debugfs entries under the client's debugfs
 * directory: writeback_congestion_kb, a symlink to the bdi stats, and
 * the mdsmap/mds_sessions/mdsc/caps/dentry_lru dump files.
 *
 * Returns 0 on success or -ENOMEM on failure; on failure everything
 * created so far is torn down via ceph_fs_debugfs_cleanup().
 */
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
	char name[100];
	int err = -ENOMEM;

	dout("ceph_fs_debugfs_init\n");
	BUG_ON(!fsc->client->debugfs_dir);
	fsc->debugfs_congestion_kb =
		debugfs_create_file("writeback_congestion_kb",
				    0600,
				    fsc->client->debugfs_dir,
				    fsc,
				    &congestion_kb_fops);
	if (!fsc->debugfs_congestion_kb)
		goto out;

	/* relative symlink into debugfs' top-level bdi directory */
	snprintf(name, sizeof(name), "../../bdi/%s",
		 dev_name(fsc->backing_dev_info.dev));
	fsc->debugfs_bdi =
		debugfs_create_symlink("bdi",
				       fsc->client->debugfs_dir,
				       name);
	if (!fsc->debugfs_bdi)
		goto out;

	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
					0600,
					fsc->client->debugfs_dir,
					fsc,
					&mdsmap_show_fops);
	if (!fsc->debugfs_mdsmap)
		goto out;

	fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
					0600,
					fsc->client->debugfs_dir,
					fsc,
					&mds_sessions_show_fops);
	if (!fsc->debugfs_mds_sessions)
		goto out;

	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
						0600,
						fsc->client->debugfs_dir,
						fsc,
						&mdsc_show_fops);
	if (!fsc->debugfs_mdsc)
		goto out;

	/* caps is read-only (0400), unlike the other entries */
	fsc->debugfs_caps = debugfs_create_file("caps",
						   0400,
						   fsc->client->debugfs_dir,
						   fsc,
						   &caps_show_fops);
	if (!fsc->debugfs_caps)
		goto out;

	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
					0600,
					fsc->client->debugfs_dir,
					fsc,
					&dentry_lru_show_fops);
	if (!fsc->debugfs_dentry_lru)
		goto out;

	return 0;

out:
	ceph_fs_debugfs_cleanup(fsc);
	return err;
}
|
||||
|
||||
|
||||
#else /* CONFIG_DEBUG_FS */
|
||||
|
||||
/* no-op stub when debugfs is not configured */
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
	return 0;
}
|
||||
|
||||
/* no-op stub when debugfs is not configured */
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
}
|
||||
|
||||
#endif /* CONFIG_DEBUG_FS */
|
1369
fs/ceph/dir.c
Normal file
1369
fs/ceph/dir.c
Normal file
File diff suppressed because it is too large
Load diff
250
fs/ceph/export.c
Normal file
250
fs/ceph/export.c
Normal file
|
@ -0,0 +1,250 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/exportfs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "mds_client.h"
|
||||
|
||||
/*
|
||||
* Basic fh
|
||||
*/
|
||||
/* minimal handle: just the inode number (FILEID_INO32_GEN) */
struct ceph_nfs_fh {
	u64 ino;
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* Larger fh that includes parent ino.
|
||||
*/
|
||||
/* connectable handle: inode number plus parent ino (FILEID_INO32_GEN_PARENT) */
struct ceph_nfs_confh {
	u64 ino, parent_ino;
} __attribute__ ((packed));
|
||||
|
||||
/*
 * Encode an NFS file handle for @inode.  If @parent_inode is given a
 * connectable handle (ino + parent ino) is produced, otherwise a plain
 * ino-only handle.  *max_len is in 4-byte words, per the exportfs API.
 */
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
			  struct inode *parent_inode)
{
	struct ceph_nfs_fh *fh = (void *)rawfh;
	struct ceph_nfs_confh *cfh = (void *)rawfh;
	int fh_words = sizeof(*fh) / 4;
	int cfh_words = sizeof(*cfh) / 4;

	/* don't re-export snaps */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EINVAL;

	/* buffer too small: report the required size and bail */
	if (parent_inode && *max_len < cfh_words) {
		*max_len = cfh_words;
		return FILEID_INVALID;
	}
	if (!parent_inode && *max_len < fh_words) {
		*max_len = fh_words;
		return FILEID_INVALID;
	}

	if (parent_inode) {
		dout("encode_fh %llx with parent %llx\n",
		     ceph_ino(inode), ceph_ino(parent_inode));
		cfh->ino = ceph_ino(inode);
		cfh->parent_ino = ceph_ino(parent_inode);
		*max_len = cfh_words;
		return FILEID_INO32_GEN_PARENT;
	}

	dout("encode_fh %llx\n", ceph_ino(inode));
	fh->ino = ceph_ino(inode);
	*max_len = fh_words;
	return FILEID_INO32_GEN;
}
|
||||
|
||||
/*
 * Turn a raw inode number into a dentry.  Tries the inode cache first;
 * on a miss, asks an MDS to locate the inode via LOOKUPINO.  Returns an
 * ERR_PTR on failure (-ESTALE if the MDS could not find the ino).
 */
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
	struct inode *inode;
	struct dentry *dentry;
	struct ceph_vino vino;
	int err;

	vino.ino = ino;
	vino.snap = CEPH_NOSNAP;
	inode = ceph_find_inode(sb, vino);
	if (!inode) {
		/* not cached: go ask the MDS */
		struct ceph_mds_request *req;

		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
					       USE_ANY_MDS);
		if (IS_ERR(req))
			return ERR_CAST(req);

		req->r_ino1 = vino;
		req->r_num_caps = 1;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		/* grab our own ref on the target before dropping the request */
		inode = req->r_target_inode;
		if (inode)
			ihold(inode);
		ceph_mdsc_put_request(req);
		if (!inode)
			return ERR_PTR(-ESTALE);
	}

	dentry = d_obtain_alias(inode);
	if (IS_ERR(dentry)) {
		iput(inode);
		return dentry;
	}
	err = ceph_init_dentry(dentry);
	if (err < 0) {
		dput(dentry);
		return ERR_PTR(err);
	}
	dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
	return dentry;
}
|
||||
|
||||
/*
|
||||
* convert regular fh to dentry
|
||||
*/
|
||||
static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
|
||||
struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
struct ceph_nfs_fh *fh = (void *)fid->raw;
|
||||
|
||||
if (fh_type != FILEID_INO32_GEN &&
|
||||
fh_type != FILEID_INO32_GEN_PARENT)
|
||||
return NULL;
|
||||
if (fh_len < sizeof(*fh) / 4)
|
||||
return NULL;
|
||||
|
||||
dout("fh_to_dentry %llx\n", fh->ino);
|
||||
return __fh_to_dentry(sb, fh->ino);
|
||||
}
|
||||
|
||||
/*
 * Ask an MDS for the parent of the given inode (identified either by
 * @child's dentry or by raw ino when @child is NULL) via LOOKUPPARENT,
 * and return a dentry for it.  Returns ERR_PTR(-ENOENT) when the MDS
 * has no parent for the inode.
 */
static struct dentry *__get_parent(struct super_block *sb,
				   struct dentry *child, u64 ino)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
	struct ceph_mds_request *req;
	struct inode *inode;
	struct dentry *dentry;
	int err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
				       USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);

	if (child) {
		req->r_inode = child->d_inode;
		ihold(child->d_inode);
	} else {
		req->r_ino1 = (struct ceph_vino) {
			.ino = ino,
			.snap = CEPH_NOSNAP,
		};
	}
	req->r_num_caps = 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	/* take our own ref on the result before releasing the request */
	inode = req->r_target_inode;
	if (inode)
		ihold(inode);
	ceph_mdsc_put_request(req);
	if (!inode)
		return ERR_PTR(-ENOENT);

	dentry = d_obtain_alias(inode);
	if (IS_ERR(dentry)) {
		iput(inode);
		return dentry;
	}
	err = ceph_init_dentry(dentry);
	if (err < 0) {
		dput(dentry);
		return ERR_PTR(err);
	}
	dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
	     child ? ceph_ino(child->d_inode) : ino,
	     dentry, ceph_vinop(inode));
	return dentry;
}
|
||||
|
||||
/*
 * get_parent export op: look up the parent of @child via the MDS.
 */
static struct dentry *ceph_get_parent(struct dentry *child)
{
	/* don't re-export snaps */
	if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
		return ERR_PTR(-EINVAL);

	dout("get_parent %p ino %llx.%llx\n",
	     child, ceph_vinop(child->d_inode));
	return __get_parent(child->d_sb, child, 0);
}
|
||||
|
||||
/*
|
||||
* convert regular fh to parent
|
||||
*/
|
||||
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
|
||||
struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
struct ceph_nfs_confh *cfh = (void *)fid->raw;
|
||||
struct dentry *dentry;
|
||||
|
||||
if (fh_type != FILEID_INO32_GEN_PARENT)
|
||||
return NULL;
|
||||
if (fh_len < sizeof(*cfh) / 4)
|
||||
return NULL;
|
||||
|
||||
dout("fh_to_parent %llx\n", cfh->parent_ino);
|
||||
dentry = __get_parent(sb, NULL, cfh->ino);
|
||||
if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
|
||||
dentry = __fh_to_dentry(sb, cfh->parent_ino);
|
||||
return dentry;
|
||||
}
|
||||
|
||||
/*
 * get_name export op: ask the MDS (LOOKUPNAME) for the name of @child
 * within @parent and copy it, NUL-terminated, into @name.
 *
 * NOTE(review): the MDS-returned dname_len is copied without a bounds
 * check here; presumably @name is NAME_MAX+1 bytes per the exportfs
 * get_name contract — confirm against callers.
 */
static int ceph_get_name(struct dentry *parent, char *name,
			 struct dentry *child)
{
	struct ceph_mds_client *mdsc;
	struct ceph_mds_request *req;
	int err;

	mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
				       USE_ANY_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* hold the parent dir's i_mutex across the request */
	mutex_lock(&parent->d_inode->i_mutex);

	req->r_inode = child->d_inode;
	ihold(child->d_inode);
	req->r_ino2 = ceph_vino(parent->d_inode);
	req->r_locked_dir = parent->d_inode;
	req->r_num_caps = 2;
	err = ceph_mdsc_do_request(mdsc, NULL, req);

	mutex_unlock(&parent->d_inode->i_mutex);

	if (!err) {
		struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
		memcpy(name, rinfo->dname, rinfo->dname_len);
		name[rinfo->dname_len] = 0;
		dout("get_name %p ino %llx.%llx name %s\n",
		     child, ceph_vinop(child->d_inode), name);
	} else {
		dout("get_name %p ino %llx.%llx err %d\n",
		     child, ceph_vinop(child->d_inode), err);
	}

	ceph_mdsc_put_request(req);
	return err;
}
|
||||
|
||||
/* exportfs operations table: makes ceph mountable over NFS */
const struct export_operations ceph_export_ops = {
	.encode_fh = ceph_encode_fh,
	.fh_to_dentry = ceph_fh_to_dentry,
	.fh_to_parent = ceph_fh_to_parent,
	.get_parent = ceph_get_parent,
	.get_name = ceph_get_name,
};
|
1273
fs/ceph/file.c
Normal file
1273
fs/ceph/file.c
Normal file
File diff suppressed because it is too large
Load diff
1982
fs/ceph/inode.c
Normal file
1982
fs/ceph/inode.c
Normal file
File diff suppressed because it is too large
Load diff
295
fs/ceph/ioctl.c
Normal file
295
fs/ceph/ioctl.c
Normal file
|
@ -0,0 +1,295 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
#include <linux/in.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "mds_client.h"
|
||||
#include "ioctl.h"
|
||||
|
||||
|
||||
/*
|
||||
* ioctls
|
||||
*/
|
||||
|
||||
/*
|
||||
* get and set the file layout
|
||||
*/
|
||||
/*
 * CEPH_IOC_GET_LAYOUT: refresh the layout caps and copy the file's
 * striping parameters out to userspace.
 */
static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
{
	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
	struct ceph_ioctl_layout l;
	int err;

	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
	if (err)
		return err;

	l.stripe_unit = ceph_file_layout_su(ci->i_layout);
	l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
	l.object_size = ceph_file_layout_object_size(ci->i_layout);
	l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
	l.preferred_osd = (s32)-1;	/* field is obsolete */
	if (copy_to_user(arg, &l, sizeof(l)))
		return -EFAULT;

	return 0;
}
|
||||
|
||||
static long __validate_layout(struct ceph_mds_client *mdsc,
|
||||
struct ceph_ioctl_layout *l)
|
||||
{
|
||||
int i, err;
|
||||
|
||||
/* validate striping parameters */
|
||||
if ((l->object_size & ~PAGE_MASK) ||
|
||||
(l->stripe_unit & ~PAGE_MASK) ||
|
||||
((unsigned)l->stripe_unit != 0 &&
|
||||
((unsigned)l->object_size % (unsigned)l->stripe_unit)))
|
||||
return -EINVAL;
|
||||
|
||||
/* make sure it's a valid data pool */
|
||||
mutex_lock(&mdsc->mutex);
|
||||
err = -EINVAL;
|
||||
for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
|
||||
if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CEPH_IOC_SET_LAYOUT: set a file's layout.  Zero fields in the
 * user-supplied layout mean "keep the current value"; the merged result
 * is validated and then sent to the auth MDS as a SETLAYOUT request.
 */
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_mds_request *req;
	struct ceph_ioctl_layout l;
	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
	struct ceph_ioctl_layout nl;	/* merged new layout */
	int err;

	if (copy_from_user(&l, arg, sizeof(l)))
		return -EFAULT;

	/* validate changed params against current layout */
	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
	if (err)
		return err;

	/* fill unset (zero) fields from the current layout */
	memset(&nl, 0, sizeof(nl));
	if (l.stripe_count)
		nl.stripe_count = l.stripe_count;
	else
		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
	if (l.stripe_unit)
		nl.stripe_unit = l.stripe_unit;
	else
		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
	if (l.object_size)
		nl.object_size = l.object_size;
	else
		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
	if (l.data_pool)
		nl.data_pool = l.data_pool;
	else
		nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);

	/* this is obsolete, and always -1 */
	nl.preferred_osd = le64_to_cpu(-1);

	err = __validate_layout(mdsc, &nl);
	if (err)
		return err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
				       USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* drop shared/excl file caps so the MDS sees the new layout */
	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;

	req->r_args.setlayout.layout.fl_stripe_unit =
		cpu_to_le32(l.stripe_unit);
	req->r_args.setlayout.layout.fl_stripe_count =
		cpu_to_le32(l.stripe_count);
	req->r_args.setlayout.layout.fl_object_size =
		cpu_to_le32(l.object_size);
	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);
	return err;
}
|
||||
|
||||
/*
|
||||
* Set a layout policy on a directory inode. All items in the tree
|
||||
 * rooted at this inode will inherit this layout on creation
 * (it doesn't apply retroactively to existing files),
 * unless a subdirectory has its own layout policy.
|
||||
*/
|
||||
/*
 * CEPH_IOC_SET_LAYOUT_POLICY: validate the user-supplied layout and
 * send it to the auth MDS as a SETDIRLAYOUT request on this directory.
 */
static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct ceph_mds_request *req;
	struct ceph_ioctl_layout l;
	int err;
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

	/* copy and validate */
	if (copy_from_user(&l, arg, sizeof(l)))
		return -EFAULT;

	err = __validate_layout(mdsc, &l);
	if (err)
		return err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
				       USE_AUTH_MDS);

	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* layout wire fields are little-endian */
	req->r_args.setlayout.layout.fl_stripe_unit =
			cpu_to_le32(l.stripe_unit);
	req->r_args.setlayout.layout.fl_stripe_count =
			cpu_to_le32(l.stripe_count);
	req->r_args.setlayout.layout.fl_object_size =
			cpu_to_le32(l.object_size);
	req->r_args.setlayout.layout.fl_pg_pool =
			cpu_to_le32(l.data_pool);

	err = ceph_mdsc_do_request(mdsc, inode, req);
	ceph_mdsc_put_request(req);
	return err;
}
|
||||
|
||||
/*
|
||||
* Return object name, size/offset information, and location (OSD
|
||||
* number, network address) for a given file offset.
|
||||
*/
|
||||
/*
 * CEPH_IOC_GET_DATALOC: map a file offset to its object name, offsets,
 * placement group, primary OSD id and OSD address, and copy the result
 * back to userspace.  Holds the osdmap read lock across the mapping.
 */
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
	struct ceph_ioctl_dataloc dl;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_client *osdc =
		&ceph_sb_to_client(inode->i_sb)->client->osdc;
	struct ceph_object_locator oloc;
	struct ceph_object_id oid;
	u64 len = 1, olen;
	u64 tmp;
	struct ceph_pg pgid;
	int r;

	/* copy and validate */
	if (copy_from_user(&dl, arg, sizeof(dl)))
		return -EFAULT;

	down_read(&osdc->map_sem);
	/* which object, and at what offset within it, holds this byte? */
	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
					  &dl.object_no, &dl.object_offset,
					  &olen);
	if (r < 0) {
		up_read(&osdc->map_sem);
		return -EIO;
	}
	dl.file_offset -= dl.object_offset;
	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
	dl.block_size = ceph_file_layout_su(ci->i_layout);

	/* block_offset = object_offset % block_size */
	tmp = dl.object_offset;
	dl.block_offset = do_div(tmp, dl.block_size);

	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
		 ceph_ino(inode), dl.object_no);

	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
	ceph_oid_set_name(&oid, dl.object_name);

	/* object -> placement group */
	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
	if (r < 0) {
		up_read(&osdc->map_sem);
		return r;
	}

	/* placement group -> primary OSD (may be <0 if none) */
	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
	if (dl.osd >= 0) {
		struct ceph_entity_addr *a =
			ceph_osd_addr(osdc->osdmap, dl.osd);
		if (a)
			memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
	} else {
		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
	}
	up_read(&osdc->map_sem);

	/* send result back to user */
	if (copy_to_user(arg, &dl, sizeof(dl)))
		return -EFAULT;

	return 0;
}
|
||||
|
||||
static long ceph_ioctl_lazyio(struct file *file)
|
||||
{
|
||||
struct ceph_file_info *fi = file->private_data;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
|
||||
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ci->i_nr_by_mode[fi->fmode]--;
|
||||
fi->fmode |= CEPH_FILE_MODE_LAZY;
|
||||
ci->i_nr_by_mode[fi->fmode]++;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
dout("ioctl_layzio: file %p marked lazy\n", file);
|
||||
|
||||
ceph_check_caps(ci, 0, NULL);
|
||||
} else {
|
||||
dout("ioctl_layzio: file %p already lazy\n", file);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* CEPH_IOC_SYNCIO: force page-cache-bypassing sync IO on this fd */
static long ceph_ioctl_syncio(struct file *file)
{
	struct ceph_file_info *fi = file->private_data;

	fi->flags |= CEPH_F_SYNC;
	return 0;
}
|
||||
|
||||
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
|
||||
switch (cmd) {
|
||||
case CEPH_IOC_GET_LAYOUT:
|
||||
return ceph_ioctl_get_layout(file, (void __user *)arg);
|
||||
|
||||
case CEPH_IOC_SET_LAYOUT:
|
||||
return ceph_ioctl_set_layout(file, (void __user *)arg);
|
||||
|
||||
case CEPH_IOC_SET_LAYOUT_POLICY:
|
||||
return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
|
||||
|
||||
case CEPH_IOC_GET_DATALOC:
|
||||
return ceph_ioctl_get_dataloc(file, (void __user *)arg);
|
||||
|
||||
case CEPH_IOC_LAZYIO:
|
||||
return ceph_ioctl_lazyio(file);
|
||||
|
||||
case CEPH_IOC_SYNCIO:
|
||||
return ceph_ioctl_syncio(file);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
}
|
100
fs/ceph/ioctl.h
Normal file
100
fs/ceph/ioctl.h
Normal file
|
@ -0,0 +1,100 @@
|
|||
#ifndef FS_CEPH_IOCTL_H
|
||||
#define FS_CEPH_IOCTL_H
|
||||
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#define CEPH_IOCTL_MAGIC 0x97
|
||||
|
||||
/*
|
||||
* CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
|
||||
* CEPH_IOC_SET_LAYOUT - set file layout
|
||||
* CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
|
||||
*
|
||||
* The file layout specifies how file data is striped over objects in
|
||||
* the distributed object store, which object pool they belong to (if
|
||||
* it differs from the default), and an optional 'preferred osd' to
|
||||
* store them on.
|
||||
*
|
||||
* Files get a new layout based on the policy set on the containing
|
||||
* directory or one of its ancestors. The GET_LAYOUT ioctl will let
|
||||
* you examine the layout for a file or the policy on a directory.
|
||||
*
|
||||
* SET_LAYOUT will let you set a layout on a newly created file. This
|
||||
* only works immediately after the file is created and before any
|
||||
* data is written to it.
|
||||
*
|
||||
* SET_LAYOUT_POLICY will let you set a layout policy (default layout)
|
||||
* on a directory that will apply to any new files created in that
|
||||
* directory (or any child directory that doesn't specify a layout of
|
||||
* its own).
|
||||
*/
|
||||
|
||||
/* use u64 to align sanely on all archs */
|
||||
/* layout as exchanged with the GET/SET_LAYOUT(_POLICY) ioctls above */
struct ceph_ioctl_layout {
	__u64 stripe_unit, stripe_count, object_size;
	__u64 data_pool;

	/* obsolete. new values ignored, always return -1 */
	__s64 preferred_osd;
};
|
||||
|
||||
#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
|
||||
struct ceph_ioctl_layout)
|
||||
#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
|
||||
struct ceph_ioctl_layout)
|
||||
#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
|
||||
struct ceph_ioctl_layout)
|
||||
|
||||
/*
|
||||
* CEPH_IOC_GET_DATALOC - get location of file data in the cluster
|
||||
*
|
||||
* Extract identity, address of the OSD and object storing a given
|
||||
* file offset.
|
||||
*/
|
||||
/* argument/result buffer for CEPH_IOC_GET_DATALOC */
struct ceph_ioctl_dataloc {
	__u64 file_offset;           /* in+out: file offset */
	__u64 object_offset;         /* out: offset in object */
	__u64 object_no;             /* out: object # */
	__u64 object_size;           /* out: object size */
	char object_name[64];        /* out: object name */
	__u64 block_offset;          /* out: offset in block */
	__u64 block_size;            /* out: block length */
	__s64 osd;                   /* out: osd # */
	struct sockaddr_storage osd_addr; /* out: osd address */
};
|
||||
|
||||
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
|
||||
struct ceph_ioctl_dataloc)
|
||||
|
||||
/*
|
||||
* CEPH_IOC_LAZYIO - relax consistency
|
||||
*
|
||||
* Normally Ceph switches to synchronous IO when multiple clients have
|
||||
* the file open (one or more of them for write). Reads and writes bypass the
|
||||
* page cache and go directly to the OSD. Setting this flag on a file
|
||||
* descriptor will allow buffered IO for this file in cases where the
|
||||
* application knows it won't interfere with other nodes (or doesn't
|
||||
* care).
|
||||
*/
|
||||
#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
|
||||
|
||||
/*
|
||||
* CEPH_IOC_SYNCIO - force synchronous IO
|
||||
*
|
||||
* This ioctl sets a file flag that forces the synchronous IO that
|
||||
* bypasses the page cache, even if it is not necessary. This is
|
||||
* essentially the opposite behavior of IOC_LAZYIO. This forces the
|
||||
* same read/write path as a file opened by multiple clients when one
|
||||
* or more of those clients is opened for write.
|
||||
*
|
||||
* Note that this type of sync IO takes a different path than a file
|
||||
* opened with O_SYNC/D_SYNC (writes hit the page cache and are
|
||||
* immediately flushed on page boundaries). It is very similar to
|
||||
* O_DIRECT (writes bypass the page cache) except that O_DIRECT writes
|
||||
* are not copied (user page must remain stable) and O_DIRECT writes
|
||||
* have alignment restrictions (on the buffer and file offset).
|
||||
*/
|
||||
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
|
||||
|
||||
#endif
|
332
fs/ceph/locks.c
Normal file
332
fs/ceph/locks.c
Normal file
|
@ -0,0 +1,332 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/file.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "mds_client.h"
|
||||
#include <linux/ceph/pagelist.h>
|
||||
|
||||
static u64 lock_secret;
|
||||
|
||||
static inline u64 secure_addr(void *addr)
|
||||
{
|
||||
u64 v = lock_secret ^ (u64)(unsigned long)addr;
|
||||
/*
|
||||
* Set the most significant bit, so that MDS knows the 'owner'
|
||||
* is sufficient to identify the owner of lock. (old code uses
|
||||
* both 'owner' and 'pid')
|
||||
*/
|
||||
v |= (1ULL << 63);
|
||||
return v;
|
||||
}
|
||||
|
||||
/* seed the lock-owner obfuscation secret at boot (see secure_addr()) */
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}
|
||||
|
||||
/**
|
||||
* Implement fcntl and flock locking functions.
|
||||
*/
|
||||
/**
 * Send a file-lock request to the auth MDS and, for GETFILELOCK,
 * translate the reply back into @fl.
 *
 * @lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @operation: CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
 * @cmd:       CEPH_LOCK_SHARED / CEPH_LOCK_EXCL / CEPH_LOCK_UNLOCK
 * @wait:      nonzero to let the MDS block until the lock is available
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;	/* 0 length == "to EOF" on the wire */
	else
		length = fl->fl_end - fl->fl_start + 1;

	/* obfuscated owner cookie, never a raw kernel pointer */
	owner = secure_addr(fl->fl_owner);

	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
	     wait, fl->fl_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_do_request(mdsc, inode, req);

	if (operation == CEPH_MDS_OP_GETFILELOCK) {
		/* translate the MDS reply (type, start, length) into @fl */
		fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_WRLCK;
		else
			fl->fl_type = F_UNLCK;

		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length -1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
	     "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
	     length, wait, fl->fl_type, err);
	return err;
}
|
||||
|
||||
/**
|
||||
* Attempt to set an fcntl lock.
|
||||
* For now, this just goes away to the server. Later it may be more awesome.
|
||||
*/
|
||||
/**
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 *
 * The MDS lock is taken first, then mirrored locally via
 * posix_lock_file(); if the local step fails (local deadlock detection)
 * or the wait is interrupted, the MDS lock is released again.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	u8 lock_cmd;
	int err;
	u8 wait = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;

	if (!(fl->fl_flags & FL_POSIX))
		return -ENOLCK;
	/* No mandatory locks */
	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
		return -ENOLCK;

	dout("ceph_lock, fl_owner: %p", fl->fl_owner);

	/* set wait bit as appropriate, then make command as Ceph expects it*/
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
	if (!err) {
		if (op != CEPH_MDS_OP_GETFILELOCK) {
			dout("mds locked, locking locally");
			err = posix_lock_file(file, fl, NULL);
			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
				/* undo! This should only happen if
				 * the kernel detects local
				 * deadlock. */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
						  CEPH_LOCK_UNLOCK, 0, fl);
				dout("got %d on posix_lock_file, undid lock",
				     err);
			}
		}

	} else if (err == -ERESTARTSYS) {
		/* interrupted while waiting: release the MDS-side lock */
		dout("undoing lock\n");
		ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
				  CEPH_LOCK_UNLOCK, 0, fl);
	}
	return err;
}
|
||||
|
||||
/*
 * Attempt to set a BSD-style flock() lock.
 * Mirrors ceph_lock(): the lock is taken on the MDS first, then locally.
 */
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
        u8 lock_cmd;
        int err;
        u8 wait = 0;

        /* only flock-family locks belong here */
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
        /* No mandatory locks */
        if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
                return -ENOLCK;

        dout("ceph_flock, fl_file: %p", fl->fl_file);

        if (IS_SETLKW(cmd))
                wait = 1;

        /* translate the VFS lock type to the Ceph wire lock type */
        if (F_RDLCK == fl->fl_type)
                lock_cmd = CEPH_LOCK_SHARED;
        else if (F_WRLCK == fl->fl_type)
                lock_cmd = CEPH_LOCK_EXCL;
        else
                lock_cmd = CEPH_LOCK_UNLOCK;

        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
                                file, lock_cmd, wait, fl);
        if (!err) {
                err = flock_lock_file_wait(file, fl);
                if (err) {
                        /* local lock failed: release the MDS-side lock
                         * again so client and server stay in sync */
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
                                          file, CEPH_LOCK_UNLOCK, 0, fl);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
                }
        } else if (err == -ERESTARTSYS) {
                /* interrupted while waiting on the MDS; undo server state */
                dout("undoing lock\n");
                ceph_lock_message(CEPH_LOCK_FLOCK,
                                  CEPH_MDS_OP_SETFILELOCK,
                                  file, CEPH_LOCK_UNLOCK, 0, fl);
        }
        return err;
}
|
||||
|
||||
/**
|
||||
* Must be called with lock_flocks() already held. Fills in the passed
|
||||
* counter variables, so you can prepare pagelist metadata before calling
|
||||
* ceph_encode_locks.
|
||||
*/
|
||||
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
|
||||
{
|
||||
struct file_lock *lock;
|
||||
|
||||
*fcntl_count = 0;
|
||||
*flock_count = 0;
|
||||
|
||||
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
|
||||
if (lock->fl_flags & FL_POSIX)
|
||||
++(*fcntl_count);
|
||||
else if (lock->fl_flags & FL_FLOCK)
|
||||
++(*flock_count);
|
||||
}
|
||||
dout("counted %d flock locks and %d fcntl locks",
|
||||
*flock_count, *fcntl_count);
|
||||
}
|
||||
|
||||
/**
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. Must be called with inode->i_lock already held.
 * If we encounter more of a specific lock type than expected, return -ENOSPC.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
                                struct ceph_filelock *flocks,
                                int num_fcntl_locks, int num_flock_locks)
{
        struct file_lock *lock;
        int err = 0;
        int seen_fcntl = 0;
        int seen_flock = 0;
        int l = 0;   /* next free slot in flocks[] */

        dout("encoding %d flock and %d fcntl locks", num_flock_locks,
             num_fcntl_locks);

        /* first pass: POSIX (fcntl) locks, which come first in the buffer */
        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                if (lock->fl_flags & FL_POSIX) {
                        ++seen_fcntl;
                        if (seen_fcntl > num_fcntl_locks) {
                                /* more locks appeared than the caller
                                 * counted; caller must retry */
                                err = -ENOSPC;
                                goto fail;
                        }
                        err = lock_to_ceph_filelock(lock, &flocks[l]);
                        if (err)
                                goto fail;
                        ++l;
                }
        }
        /* second pass: flock locks, appended after the fcntl locks */
        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                if (lock->fl_flags & FL_FLOCK) {
                        ++seen_flock;
                        if (seen_flock > num_flock_locks) {
                                err = -ENOSPC;
                                goto fail;
                        }
                        err = lock_to_ceph_filelock(lock, &flocks[l]);
                        if (err)
                                goto fail;
                        ++l;
                }
        }
fail:
        return err;
}
|
||||
|
||||
/**
|
||||
* Copy the encoded flock and fcntl locks into the pagelist.
|
||||
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
|
||||
* sequential flock locks.
|
||||
* Returns zero on success.
|
||||
*/
|
||||
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
|
||||
struct ceph_pagelist *pagelist,
|
||||
int num_fcntl_locks, int num_flock_locks)
|
||||
{
|
||||
int err = 0;
|
||||
__le32 nlocks;
|
||||
|
||||
nlocks = cpu_to_le32(num_fcntl_locks);
|
||||
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
|
||||
if (err)
|
||||
goto out_fail;
|
||||
|
||||
err = ceph_pagelist_append(pagelist, flocks,
|
||||
num_fcntl_locks * sizeof(*flocks));
|
||||
if (err)
|
||||
goto out_fail;
|
||||
|
||||
nlocks = cpu_to_le32(num_flock_locks);
|
||||
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
|
||||
if (err)
|
||||
goto out_fail;
|
||||
|
||||
err = ceph_pagelist_append(pagelist,
|
||||
&flocks[num_fcntl_locks],
|
||||
num_flock_locks * sizeof(*flocks));
|
||||
out_fail:
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a pointer to a lock, convert it to a ceph filelock
|
||||
*/
|
||||
int lock_to_ceph_filelock(struct file_lock *lock,
|
||||
struct ceph_filelock *cephlock)
|
||||
{
|
||||
int err = 0;
|
||||
cephlock->start = cpu_to_le64(lock->fl_start);
|
||||
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
|
||||
cephlock->client = cpu_to_le64(0);
|
||||
cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
|
||||
cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
|
||||
|
||||
switch (lock->fl_type) {
|
||||
case F_RDLCK:
|
||||
cephlock->type = CEPH_LOCK_SHARED;
|
||||
break;
|
||||
case F_WRLCK:
|
||||
cephlock->type = CEPH_LOCK_EXCL;
|
||||
break;
|
||||
case F_UNLCK:
|
||||
cephlock->type = CEPH_LOCK_UNLOCK;
|
||||
break;
|
||||
default:
|
||||
dout("Have unknown lock type %d", lock->fl_type);
|
||||
err = -EINVAL;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
3758
fs/ceph/mds_client.c
Normal file
3758
fs/ceph/mds_client.c
Normal file
File diff suppressed because it is too large
Load diff
394
fs/ceph/mds_client.h
Normal file
394
fs/ceph/mds_client.h
Normal file
|
@ -0,0 +1,394 @@
|
|||
#ifndef _FS_CEPH_MDS_CLIENT_H
|
||||
#define _FS_CEPH_MDS_CLIENT_H
|
||||
|
||||
#include <linux/completion.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
#include <linux/ceph/types.h>
|
||||
#include <linux/ceph/messenger.h>
|
||||
#include <linux/ceph/mdsmap.h>
|
||||
#include <linux/ceph/auth.h>
|
||||
|
||||
/*
|
||||
* Some lock dependencies:
|
||||
*
|
||||
* session->s_mutex
|
||||
* mdsc->mutex
|
||||
*
|
||||
* mdsc->snap_rwsem
|
||||
*
|
||||
* ci->i_ceph_lock
|
||||
* mdsc->snap_flush_lock
|
||||
* mdsc->cap_delay_lock
|
||||
*
|
||||
*/
|
||||
|
||||
struct ceph_fs_client;
|
||||
struct ceph_cap;
|
||||
|
||||
/*
|
||||
* parsed info about a single inode. pointers are into the encoded
|
||||
* on-wire structures within the mds reply message payload.
|
||||
*/
|
||||
struct ceph_mds_reply_info_in {
|
||||
struct ceph_mds_reply_inode *in;
|
||||
struct ceph_dir_layout dir_layout;
|
||||
u32 symlink_len;
|
||||
char *symlink;
|
||||
u32 xattr_len;
|
||||
char *xattr_data;
|
||||
};
|
||||
|
||||
/*
|
||||
* parsed info about an mds reply, including information about
|
||||
* either: 1) the target inode and/or its parent directory and dentry,
|
||||
* and directory contents (for readdir results), or
|
||||
* 2) the file range lock info (for fcntl F_GETLK results).
|
||||
*/
|
||||
struct ceph_mds_reply_info_parsed {
|
||||
struct ceph_mds_reply_head *head;
|
||||
|
||||
/* trace */
|
||||
struct ceph_mds_reply_info_in diri, targeti;
|
||||
struct ceph_mds_reply_dirfrag *dirfrag;
|
||||
char *dname;
|
||||
u32 dname_len;
|
||||
struct ceph_mds_reply_lease *dlease;
|
||||
|
||||
/* extra */
|
||||
union {
|
||||
/* for fcntl F_GETLK results */
|
||||
struct ceph_filelock *filelock_reply;
|
||||
|
||||
/* for readdir results */
|
||||
struct {
|
||||
struct ceph_mds_reply_dirfrag *dir_dir;
|
||||
size_t dir_buf_size;
|
||||
int dir_nr;
|
||||
char **dir_dname;
|
||||
u32 *dir_dname_len;
|
||||
struct ceph_mds_reply_lease **dir_dlease;
|
||||
struct ceph_mds_reply_info_in *dir_in;
|
||||
u8 dir_complete, dir_end;
|
||||
};
|
||||
|
||||
/* for create results */
|
||||
struct {
|
||||
bool has_create_ino;
|
||||
u64 ino;
|
||||
};
|
||||
};
|
||||
|
||||
/* encoded blob describing snapshot contexts for certain
|
||||
operations (e.g., open) */
|
||||
void *snapblob;
|
||||
int snapblob_len;
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* cap releases are batched and sent to the MDS en masse.
|
||||
*/
|
||||
#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
|
||||
sizeof(struct ceph_mds_cap_release)) / \
|
||||
sizeof(struct ceph_mds_cap_item))
|
||||
|
||||
|
||||
/*
|
||||
* state associated with each MDS<->client session
|
||||
*/
|
||||
enum {
|
||||
CEPH_MDS_SESSION_NEW = 1,
|
||||
CEPH_MDS_SESSION_OPENING = 2,
|
||||
CEPH_MDS_SESSION_OPEN = 3,
|
||||
CEPH_MDS_SESSION_HUNG = 4,
|
||||
CEPH_MDS_SESSION_CLOSING = 5,
|
||||
CEPH_MDS_SESSION_RESTARTING = 6,
|
||||
CEPH_MDS_SESSION_RECONNECTING = 7,
|
||||
};
|
||||
|
||||
struct ceph_mds_session {
|
||||
struct ceph_mds_client *s_mdsc;
|
||||
int s_mds;
|
||||
int s_state;
|
||||
unsigned long s_ttl; /* time until mds kills us */
|
||||
u64 s_seq; /* incoming msg seq # */
|
||||
struct mutex s_mutex; /* serialize session messages */
|
||||
|
||||
struct ceph_connection s_con;
|
||||
|
||||
struct ceph_auth_handshake s_auth;
|
||||
|
||||
/* protected by s_gen_ttl_lock */
|
||||
spinlock_t s_gen_ttl_lock;
|
||||
u32 s_cap_gen; /* inc each time we get mds stale msg */
|
||||
unsigned long s_cap_ttl; /* when session caps expire */
|
||||
|
||||
/* protected by s_cap_lock */
|
||||
spinlock_t s_cap_lock;
|
||||
struct list_head s_caps; /* all caps issued by this session */
|
||||
int s_nr_caps, s_trim_caps;
|
||||
int s_num_cap_releases;
|
||||
int s_cap_reconnect;
|
||||
struct list_head s_cap_releases; /* waiting cap_release messages */
|
||||
struct list_head s_cap_releases_done; /* ready to send */
|
||||
struct ceph_cap *s_cap_iterator;
|
||||
|
||||
/* protected by mutex */
|
||||
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
|
||||
struct list_head s_cap_snaps_flushing;
|
||||
unsigned long s_renew_requested; /* last time we sent a renew req */
|
||||
u64 s_renew_seq;
|
||||
|
||||
atomic_t s_ref;
|
||||
struct list_head s_waiting; /* waiting requests */
|
||||
struct list_head s_unsafe; /* unsafe requests */
|
||||
};
|
||||
|
||||
/*
|
||||
* modes of choosing which MDS to send a request to
|
||||
*/
|
||||
enum {
|
||||
USE_ANY_MDS,
|
||||
USE_RANDOM_MDS,
|
||||
USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
|
||||
};
|
||||
|
||||
struct ceph_mds_request;
|
||||
struct ceph_mds_client;
|
||||
|
||||
/*
|
||||
* request completion callback
|
||||
*/
|
||||
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_request *req);
|
||||
|
||||
/*
|
||||
* an in-flight mds request
|
||||
*/
|
||||
struct ceph_mds_request {
|
||||
u64 r_tid; /* transaction id */
|
||||
struct rb_node r_node;
|
||||
struct ceph_mds_client *r_mdsc;
|
||||
|
||||
int r_op; /* mds op code */
|
||||
|
||||
/* operation on what? */
|
||||
struct inode *r_inode; /* arg1 */
|
||||
struct dentry *r_dentry; /* arg1 */
|
||||
struct dentry *r_old_dentry; /* arg2: rename from or link from */
|
||||
struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
|
||||
char *r_path1, *r_path2;
|
||||
struct ceph_vino r_ino1, r_ino2;
|
||||
|
||||
struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
|
||||
struct inode *r_target_inode; /* resulting inode */
|
||||
|
||||
struct mutex r_fill_mutex;
|
||||
|
||||
union ceph_mds_request_args r_args;
|
||||
int r_fmode; /* file mode, if expecting cap */
|
||||
kuid_t r_uid;
|
||||
kgid_t r_gid;
|
||||
struct timespec r_stamp;
|
||||
|
||||
/* for choosing which mds to send this request to */
|
||||
int r_direct_mode;
|
||||
u32 r_direct_hash; /* choose dir frag based on this dentry hash */
|
||||
bool r_direct_is_hash; /* true if r_direct_hash is valid */
|
||||
|
||||
/* data payload is used for xattr ops */
|
||||
struct ceph_pagelist *r_pagelist;
|
||||
|
||||
/* what caps shall we drop? */
|
||||
int r_inode_drop, r_inode_unless;
|
||||
int r_dentry_drop, r_dentry_unless;
|
||||
int r_old_dentry_drop, r_old_dentry_unless;
|
||||
struct inode *r_old_inode;
|
||||
int r_old_inode_drop, r_old_inode_unless;
|
||||
|
||||
struct ceph_msg *r_request; /* original request */
|
||||
int r_request_release_offset;
|
||||
struct ceph_msg *r_reply;
|
||||
struct ceph_mds_reply_info_parsed r_reply_info;
|
||||
int r_err;
|
||||
bool r_aborted;
|
||||
|
||||
unsigned long r_timeout; /* optional. jiffies */
|
||||
unsigned long r_started; /* start time to measure timeout against */
|
||||
unsigned long r_request_started; /* start time for mds request only,
|
||||
used to measure lease durations */
|
||||
|
||||
/* link unsafe requests to parent directory, for fsync */
|
||||
struct inode *r_unsafe_dir;
|
||||
struct list_head r_unsafe_dir_item;
|
||||
|
||||
struct ceph_mds_session *r_session;
|
||||
|
||||
int r_attempts; /* resend attempts */
|
||||
int r_num_fwd; /* number of forward attempts */
|
||||
int r_resend_mds; /* mds to resend to next, if any*/
|
||||
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
|
||||
|
||||
struct kref r_kref;
|
||||
struct list_head r_wait;
|
||||
struct completion r_completion;
|
||||
struct completion r_safe_completion;
|
||||
ceph_mds_request_callback_t r_callback;
|
||||
struct list_head r_unsafe_item; /* per-session unsafe list item */
|
||||
bool r_got_unsafe, r_got_safe, r_got_result;
|
||||
|
||||
bool r_did_prepopulate;
|
||||
u32 r_readdir_offset;
|
||||
|
||||
struct ceph_cap_reservation r_caps_reservation;
|
||||
int r_num_caps;
|
||||
};
|
||||
|
||||
/*
|
||||
* mds client state
|
||||
*/
|
||||
struct ceph_mds_client {
|
||||
struct ceph_fs_client *fsc;
|
||||
struct mutex mutex; /* all nested structures */
|
||||
|
||||
struct ceph_mdsmap *mdsmap;
|
||||
struct completion safe_umount_waiters;
|
||||
wait_queue_head_t session_close_wq;
|
||||
struct list_head waiting_for_map;
|
||||
|
||||
struct ceph_mds_session **sessions; /* NULL for mds if no session */
|
||||
int max_sessions; /* len of s_mds_sessions */
|
||||
int stopping; /* true if shutting down */
|
||||
|
||||
/*
|
||||
* snap_rwsem will cover cap linkage into snaprealms, and
|
||||
* realm snap contexts. (later, we can do per-realm snap
|
||||
* contexts locks..) the empty list contains realms with no
|
||||
* references (implying they contain no inodes with caps) that
|
||||
* should be destroyed.
|
||||
*/
|
||||
struct rw_semaphore snap_rwsem;
|
||||
struct rb_root snap_realms;
|
||||
struct list_head snap_empty;
|
||||
spinlock_t snap_empty_lock; /* protect snap_empty */
|
||||
|
||||
u64 last_tid; /* most recent mds request */
|
||||
struct rb_root request_tree; /* pending mds requests */
|
||||
struct delayed_work delayed_work; /* delayed work */
|
||||
unsigned long last_renew_caps; /* last time we renewed our caps */
|
||||
struct list_head cap_delay_list; /* caps with delayed release */
|
||||
spinlock_t cap_delay_lock; /* protects cap_delay_list */
|
||||
struct list_head snap_flush_list; /* cap_snaps ready to flush */
|
||||
spinlock_t snap_flush_lock;
|
||||
|
||||
u64 cap_flush_seq;
|
||||
struct list_head cap_dirty; /* inodes with dirty caps */
|
||||
struct list_head cap_dirty_migrating; /* ...that are migration... */
|
||||
int num_cap_flushing; /* # caps we are flushing */
|
||||
spinlock_t cap_dirty_lock; /* protects above items */
|
||||
wait_queue_head_t cap_flushing_wq;
|
||||
|
||||
/*
|
||||
* Cap reservations
|
||||
*
|
||||
* Maintain a global pool of preallocated struct ceph_caps, referenced
|
||||
* by struct ceph_caps_reservations. This ensures that we preallocate
|
||||
* memory needed to successfully process an MDS response. (If an MDS
|
||||
* sends us cap information and we fail to process it, we will have
|
||||
* problems due to the client and MDS being out of sync.)
|
||||
*
|
||||
* Reservations are 'owned' by a ceph_cap_reservation context.
|
||||
*/
|
||||
spinlock_t caps_list_lock;
|
||||
struct list_head caps_list; /* unused (reserved or
|
||||
unreserved) */
|
||||
int caps_total_count; /* total caps allocated */
|
||||
int caps_use_count; /* in use */
|
||||
int caps_reserve_count; /* unused, reserved */
|
||||
int caps_avail_count; /* unused, unreserved */
|
||||
int caps_min_count; /* keep at least this many
|
||||
(unreserved) */
|
||||
spinlock_t dentry_lru_lock;
|
||||
struct list_head dentry_lru;
|
||||
int num_dentry;
|
||||
};
|
||||
|
||||
extern const char *ceph_mds_op_name(int op);
|
||||
|
||||
extern struct ceph_mds_session *
|
||||
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
|
||||
|
||||
/* take a reference on an mds session; returns the session for chaining */
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
        atomic_inc(&s->s_ref);
        return s;
}
|
||||
|
||||
extern const char *ceph_session_state_name(int s);
|
||||
|
||||
extern void ceph_put_mds_session(struct ceph_mds_session *s);
|
||||
|
||||
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
|
||||
struct ceph_msg *msg, int mds);
|
||||
|
||||
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
|
||||
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
|
||||
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
|
||||
|
||||
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
|
||||
|
||||
extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
|
||||
struct inode *inode,
|
||||
struct dentry *dn);
|
||||
|
||||
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
|
||||
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
|
||||
struct inode *dir);
|
||||
extern struct ceph_mds_request *
|
||||
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
|
||||
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_request *req);
|
||||
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
|
||||
struct inode *dir,
|
||||
struct ceph_mds_request *req);
|
||||
/* take a reference on an in-flight mds request */
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
        kref_get(&req->r_kref);
}
|
||||
extern void ceph_mdsc_release_request(struct kref *kref);
|
||||
/* drop a request reference; the last put frees it via
 * ceph_mdsc_release_request */
static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
        kref_put(&req->r_kref, ceph_mdsc_release_request);
}
|
||||
|
||||
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
|
||||
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
|
||||
|
||||
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
|
||||
int stop_on_nosnap);
|
||||
|
||||
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
|
||||
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
|
||||
struct inode *inode,
|
||||
struct dentry *dentry, char action,
|
||||
u32 seq);
|
||||
|
||||
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
|
||||
struct ceph_msg *msg);
|
||||
|
||||
extern struct ceph_mds_session *
|
||||
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
|
||||
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
|
||||
#endif
|
189
fs/ceph/mdsmap.c
Normal file
189
fs/ceph/mdsmap.c
Normal file
|
@ -0,0 +1,189 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/bug.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include <linux/ceph/mdsmap.h>
|
||||
#include <linux/ceph/messenger.h>
|
||||
#include <linux/ceph/decode.h>
|
||||
|
||||
#include "super.h"
|
||||
|
||||
|
||||
/*
|
||||
* choose a random mds that is "up" (i.e. has a state > 0), or -1.
|
||||
*/
|
||||
int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
|
||||
{
|
||||
int n = 0;
|
||||
int i;
|
||||
|
||||
/* special case for one mds */
|
||||
if (1 == m->m_max_mds && m->m_info[0].state > 0)
|
||||
return 0;
|
||||
|
||||
/* count */
|
||||
for (i = 0; i < m->m_max_mds; i++)
|
||||
if (m->m_info[i].state > 0)
|
||||
n++;
|
||||
if (n == 0)
|
||||
return -1;
|
||||
|
||||
/* pick */
|
||||
n = prandom_u32() % n;
|
||||
i = 0;
|
||||
for (i = 0; n > 0; i++, n--)
|
||||
while (m->m_info[i].state <= 0)
|
||||
i++;
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode an MDS map
|
||||
*
|
||||
* Ignore any fields we don't care about (there are quite a few of
|
||||
* them).
|
||||
*/
|
||||
struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
|
||||
{
|
||||
struct ceph_mdsmap *m;
|
||||
const void *start = *p;
|
||||
int i, j, n;
|
||||
int err = -EINVAL;
|
||||
u16 version;
|
||||
|
||||
m = kzalloc(sizeof(*m), GFP_NOFS);
|
||||
if (m == NULL)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
ceph_decode_16_safe(p, end, version, bad);
|
||||
if (version > 3) {
|
||||
pr_warn("got mdsmap version %d > 3, failing", version);
|
||||
goto bad;
|
||||
}
|
||||
|
||||
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
|
||||
m->m_epoch = ceph_decode_32(p);
|
||||
m->m_client_epoch = ceph_decode_32(p);
|
||||
m->m_last_failure = ceph_decode_32(p);
|
||||
m->m_root = ceph_decode_32(p);
|
||||
m->m_session_timeout = ceph_decode_32(p);
|
||||
m->m_session_autoclose = ceph_decode_32(p);
|
||||
m->m_max_file_size = ceph_decode_64(p);
|
||||
m->m_max_mds = ceph_decode_32(p);
|
||||
|
||||
m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
|
||||
if (m->m_info == NULL)
|
||||
goto badmem;
|
||||
|
||||
/* pick out active nodes from mds_info (state > 0) */
|
||||
n = ceph_decode_32(p);
|
||||
for (i = 0; i < n; i++) {
|
||||
u64 global_id;
|
||||
u32 namelen;
|
||||
s32 mds, inc, state;
|
||||
u64 state_seq;
|
||||
u8 infoversion;
|
||||
struct ceph_entity_addr addr;
|
||||
u32 num_export_targets;
|
||||
void *pexport_targets = NULL;
|
||||
struct ceph_timespec laggy_since;
|
||||
struct ceph_mds_info *info;
|
||||
|
||||
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
|
||||
global_id = ceph_decode_64(p);
|
||||
infoversion = ceph_decode_8(p);
|
||||
*p += sizeof(u64);
|
||||
namelen = ceph_decode_32(p); /* skip mds name */
|
||||
*p += namelen;
|
||||
|
||||
ceph_decode_need(p, end,
|
||||
4*sizeof(u32) + sizeof(u64) +
|
||||
sizeof(addr) + sizeof(struct ceph_timespec),
|
||||
bad);
|
||||
mds = ceph_decode_32(p);
|
||||
inc = ceph_decode_32(p);
|
||||
state = ceph_decode_32(p);
|
||||
state_seq = ceph_decode_64(p);
|
||||
ceph_decode_copy(p, &addr, sizeof(addr));
|
||||
ceph_decode_addr(&addr);
|
||||
ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
|
||||
*p += sizeof(u32);
|
||||
ceph_decode_32_safe(p, end, namelen, bad);
|
||||
*p += namelen;
|
||||
if (infoversion >= 2) {
|
||||
ceph_decode_32_safe(p, end, num_export_targets, bad);
|
||||
pexport_targets = *p;
|
||||
*p += num_export_targets * sizeof(u32);
|
||||
} else {
|
||||
num_export_targets = 0;
|
||||
}
|
||||
|
||||
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
|
||||
i+1, n, global_id, mds, inc,
|
||||
ceph_pr_addr(&addr.in_addr),
|
||||
ceph_mds_state_name(state));
|
||||
|
||||
if (mds < 0 || mds >= m->m_max_mds || state <= 0)
|
||||
continue;
|
||||
|
||||
info = &m->m_info[mds];
|
||||
info->global_id = global_id;
|
||||
info->state = state;
|
||||
info->addr = addr;
|
||||
info->laggy = (laggy_since.tv_sec != 0 ||
|
||||
laggy_since.tv_nsec != 0);
|
||||
info->num_export_targets = num_export_targets;
|
||||
if (num_export_targets) {
|
||||
info->export_targets = kcalloc(num_export_targets,
|
||||
sizeof(u32), GFP_NOFS);
|
||||
if (info->export_targets == NULL)
|
||||
goto badmem;
|
||||
for (j = 0; j < num_export_targets; j++)
|
||||
info->export_targets[j] =
|
||||
ceph_decode_32(&pexport_targets);
|
||||
} else {
|
||||
info->export_targets = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* pg_pools */
|
||||
ceph_decode_32_safe(p, end, n, bad);
|
||||
m->m_num_data_pg_pools = n;
|
||||
m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
|
||||
if (!m->m_data_pg_pools)
|
||||
goto badmem;
|
||||
ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
|
||||
for (i = 0; i < n; i++)
|
||||
m->m_data_pg_pools[i] = ceph_decode_64(p);
|
||||
m->m_cas_pg_pool = ceph_decode_64(p);
|
||||
|
||||
/* ok, we don't care about the rest. */
|
||||
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
|
||||
return m;
|
||||
|
||||
badmem:
|
||||
err = -ENOMEM;
|
||||
bad:
|
||||
pr_err("corrupt mdsmap\n");
|
||||
print_hex_dump(KERN_DEBUG, "mdsmap: ",
|
||||
DUMP_PREFIX_OFFSET, 16, 1,
|
||||
start, end - start, true);
|
||||
ceph_mdsmap_destroy(m);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < m->m_max_mds; i++)
|
||||
kfree(m->m_info[i].export_targets);
|
||||
kfree(m->m_info);
|
||||
kfree(m->m_data_pg_pools);
|
||||
kfree(m);
|
||||
}
|
932
fs/ceph/snap.c
Normal file
932
fs/ceph/snap.c
Normal file
|
@ -0,0 +1,932 @@
|
|||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/sort.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "mds_client.h"
|
||||
|
||||
#include <linux/ceph/decode.h>
|
||||
|
||||
/*
|
||||
* Snapshots in ceph are driven in large part by cooperation from the
|
||||
* client. In contrast to local file systems or file servers that
|
||||
* implement snapshots at a single point in the system, ceph's
|
||||
* distributed access to storage requires clients to help decide
|
||||
* whether a write logically occurs before or after a recently created
|
||||
* snapshot.
|
||||
*
|
||||
 * This provides a perfect instantaneous client-wide snapshot.  Between
|
||||
* clients, however, snapshots may appear to be applied at slightly
|
||||
* different points in time, depending on delays in delivering the
|
||||
* snapshot notification.
|
||||
*
|
||||
* Snapshots are _not_ file system-wide. Instead, each snapshot
|
||||
* applies to the subdirectory nested beneath some directory. This
|
||||
* effectively divides the hierarchy into multiple "realms," where all
|
||||
* of the files contained by each realm share the same set of
|
||||
* snapshots. An individual realm's snap set contains snapshots
|
||||
* explicitly created on that realm, as well as any snaps in its
|
||||
 * parent's snap set _after_ the point at which the parent became its
|
||||
* parent (due to, say, a rename). Similarly, snaps from prior parents
|
||||
* during the time intervals during which they were the parent are included.
|
||||
*
|
||||
* The client is spared most of this detail, fortunately... it must only
|
||||
 * maintain a hierarchy of realms reflecting the current parent/child
|
||||
* realm relationship, and for each realm has an explicit list of snaps
|
||||
* inherited from prior parents.
|
||||
*
|
||||
* A snap_realm struct is maintained for realms containing every inode
|
||||
* with an open cap in the system. (The needed snap realm information is
|
||||
* provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
|
||||
* version number is used to ensure that as realm parameters change (new
|
||||
* snapshot, new parent, etc.) the client's realm hierarchy is updated.
|
||||
*
|
||||
* The realm hierarchy drives the generation of a 'snap context' for each
|
||||
* realm, which simply lists the resulting set of snaps for the realm. This
|
||||
* is attached to any writes sent to OSDs.
|
||||
*/
|
||||
/*
|
||||
* Unfortunately error handling is a bit mixed here. If we get a snap
|
||||
* update, but don't have enough memory to update our realm hierarchy,
|
||||
* it's not clear what we can do about it (besides complaining to the
|
||||
* console).
|
||||
*/
|
||||
|
||||
|
||||
/*
 * increase ref count for the realm
 *
 * caller must hold snap_rwsem for write.
 */
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
                         struct ceph_snap_realm *realm)
{
        dout("get_realm %p %d -> %d\n", realm,
             atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
        /*
         * since we _only_ increment realm refs or empty the empty
         * list with snap_rwsem held, adjusting the empty list here is
         * safe.  we do need to protect against concurrent empty list
         * additions, however.
         */
        /* nref == 0 means the realm is parked on mdsc->snap_empty
         * (see ceph_put_snap_realm); un-park it before reviving it */
        if (atomic_read(&realm->nref) == 0) {
                spin_lock(&mdsc->snap_empty_lock);
                list_del_init(&realm->empty_item);
                spin_unlock(&mdsc->snap_empty_lock);
        }

        atomic_inc(&realm->nref);
}
|
||||
|
||||
static void __insert_snap_realm(struct rb_root *root,
|
||||
struct ceph_snap_realm *new)
|
||||
{
|
||||
struct rb_node **p = &root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct ceph_snap_realm *r = NULL;
|
||||
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
r = rb_entry(parent, struct ceph_snap_realm, node);
|
||||
if (new->ino < r->ino)
|
||||
p = &(*p)->rb_left;
|
||||
else if (new->ino > r->ino)
|
||||
p = &(*p)->rb_right;
|
||||
else
|
||||
BUG();
|
||||
}
|
||||
|
||||
rb_link_node(&new->node, parent, p);
|
||||
rb_insert_color(&new->node, root);
|
||||
}
|
||||
|
||||
/*
 * create the realm rooted at @ino and insert it into the lookup tree.
 *
 * NOTE(review): the original comment said "and bump its ref count", but
 * nref is left at 0 here and never incremented — presumably the caller is
 * expected to take the reference (e.g. via ceph_get_snap_realm); confirm
 * against callers.
 *
 * caller must hold snap_rwsem for write.
 */
static struct ceph_snap_realm *ceph_create_snap_realm(
        struct ceph_mds_client *mdsc,
        u64 ino)
{
        struct ceph_snap_realm *realm;

        realm = kzalloc(sizeof(*realm), GFP_NOFS);
        if (!realm)
                return ERR_PTR(-ENOMEM);

        atomic_set(&realm->nref, 0);    /* tree does not take a ref */
        realm->ino = ino;
        INIT_LIST_HEAD(&realm->children);
        INIT_LIST_HEAD(&realm->child_item);
        INIT_LIST_HEAD(&realm->empty_item);
        INIT_LIST_HEAD(&realm->dirty_item);
        INIT_LIST_HEAD(&realm->inodes_with_caps);
        spin_lock_init(&realm->inodes_with_caps_lock);
        __insert_snap_realm(&mdsc->snap_realms, realm);
        dout("create_snap_realm %llx %p\n", realm->ino, realm);
        return realm;
}
|
||||
|
||||
/*
|
||||
* lookup the realm rooted at @ino.
|
||||
*
|
||||
* caller must hold snap_rwsem for write.
|
||||
*/
|
||||
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
|
||||
u64 ino)
|
||||
{
|
||||
struct rb_node *n = mdsc->snap_realms.rb_node;
|
||||
struct ceph_snap_realm *r;
|
||||
|
||||
while (n) {
|
||||
r = rb_entry(n, struct ceph_snap_realm, node);
|
||||
if (ino < r->ino)
|
||||
n = n->rb_left;
|
||||
else if (ino > r->ino)
|
||||
n = n->rb_right;
|
||||
else {
|
||||
dout("lookup_snap_realm %llx %p\n", r->ino, r);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void __put_snap_realm(struct ceph_mds_client *mdsc,
|
||||
struct ceph_snap_realm *realm);
|
||||
|
||||
/*
 * Unlink a realm from the tree and free it, dropping the reference it
 * held on its parent (which may in turn free the parent).
 *
 * called with snap_rwsem (write)
 */
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
                                 struct ceph_snap_realm *realm)
{
        dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);

        rb_erase(&realm->node, &mdsc->snap_realms);

        if (realm->parent) {
                /* drop our ref on the parent; may cascade */
                list_del_init(&realm->child_item);
                __put_snap_realm(mdsc, realm->parent);
        }

        kfree(realm->prior_parent_snaps);
        kfree(realm->snaps);
        ceph_put_snap_context(realm->cached_context);
        kfree(realm);
}
|
||||
|
||||
/*
 * Drop a realm reference; destroys the realm when the count hits zero.
 *
 * caller holds snap_rwsem (write)
 */
static void __put_snap_realm(struct ceph_mds_client *mdsc,
                             struct ceph_snap_realm *realm)
{
        dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
             atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
        if (atomic_dec_and_test(&realm->nref))
                __destroy_snap_realm(mdsc, realm);
}
|
||||
|
||||
/*
 * Drop a realm reference without holding snap_rwsem.
 *
 * caller needn't hold any locks
 */
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
                         struct ceph_snap_realm *realm)
{
        dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
             atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
        if (!atomic_dec_and_test(&realm->nref))
                return;

        /* last ref: destroy now if we can get the write lock cheaply,
         * otherwise park the realm on mdsc->snap_empty and let
         * __cleanup_empty_realms() reap it later */
        if (down_write_trylock(&mdsc->snap_rwsem)) {
                __destroy_snap_realm(mdsc, realm);
                up_write(&mdsc->snap_rwsem);
        } else {
                spin_lock(&mdsc->snap_empty_lock);
                list_add(&realm->empty_item, &mdsc->snap_empty);
                spin_unlock(&mdsc->snap_empty_lock);
        }
}
|
||||
|
||||
/*
 * Clean up any realms whose ref counts have dropped to zero.  Note
 * that this does not include realms who were created but not yet
 * used.
 *
 * Called under snap_rwsem (write)
 */
static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
	struct ceph_snap_realm *realm;

	spin_lock(&mdsc->snap_empty_lock);
	while (!list_empty(&mdsc->snap_empty)) {
		realm = list_first_entry(&mdsc->snap_empty,
				   struct ceph_snap_realm, empty_item);
		list_del(&realm->empty_item);
		/* drop the spinlock around __destroy_snap_realm(): it
		 * frees memory and may recursively put the parent,
		 * which must not happen under snap_empty_lock */
		spin_unlock(&mdsc->snap_empty_lock);
		__destroy_snap_realm(mdsc, realm);
		spin_lock(&mdsc->snap_empty_lock);
	}
	spin_unlock(&mdsc->snap_empty_lock);
}
|
||||
|
||||
/*
 * Flush the snap_empty list: take snap_rwsem for write (as required
 * by __destroy_snap_realm()) and destroy any zero-ref realms that
 * ceph_put_snap_realm() had to defer.
 */
void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
	down_write(&mdsc->snap_rwsem);
	__cleanup_empty_realms(mdsc);
	up_write(&mdsc->snap_rwsem);
}
|
||||
|
||||
/*
 * adjust the parent realm of a given @realm.  adjust child list, and parent
 * pointers, and ref counts appropriately.
 *
 * return 1 if parent was changed, 0 if unchanged, <0 on error.
 *
 * caller must hold snap_rwsem for write.
 */
static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
				    struct ceph_snap_realm *realm,
				    u64 parentino)
{
	struct ceph_snap_realm *parent;

	/* already parented correctly? nothing to do */
	if (realm->parent_ino == parentino)
		return 0;

	parent = ceph_lookup_snap_realm(mdsc, parentino);
	if (!parent) {
		/* parent not known yet; create a placeholder realm for it */
		parent = ceph_create_snap_realm(mdsc, parentino);
		if (IS_ERR(parent))
			return PTR_ERR(parent);
	}
	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
	     realm->ino, realm, realm->parent_ino, realm->parent,
	     parentino, parent);
	if (realm->parent) {
		/* unhook from the old parent and drop our ref on it */
		list_del_init(&realm->child_item);
		ceph_put_snap_realm(mdsc, realm->parent);
	}
	realm->parent_ino = parentino;
	realm->parent = parent;
	/* hold a ref on the new parent for the realm->parent pointer */
	ceph_get_snap_realm(mdsc, parent);
	list_add(&realm->child_item, &parent->children);
	return 1;
}
|
||||
|
||||
|
||||
static int cmpu64_rev(const void *a, const void *b)
|
||||
{
|
||||
if (*(u64 *)a < *(u64 *)b)
|
||||
return 1;
|
||||
if (*(u64 *)a > *(u64 *)b)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * build the snap context for a given realm.
 *
 * The snap context is the realm's own snaps plus prior-parent snaps,
 * plus any of the parent's snaps taken after this realm was parented
 * there, sorted in descending order.  Rebuilds the parent's context
 * first (recursively) if it is stale or missing.  Returns 0 on
 * success; on failure the stale cached_context is dropped so a later
 * rebuild starts clean.
 */
static int build_snap_context(struct ceph_snap_realm *realm)
{
	struct ceph_snap_realm *parent = realm->parent;
	struct ceph_snap_context *snapc;
	int err = 0;
	u32 num = realm->num_prior_parent_snaps + realm->num_snaps;

	/*
	 * build parent context, if it hasn't been built.
	 * conservatively estimate that all parent snaps might be
	 * included by us.
	 */
	if (parent) {
		if (!parent->cached_context) {
			err = build_snap_context(parent);
			if (err)
				goto fail;
		}
		num += parent->cached_context->num_snaps;
	}

	/* do i actually need to update?  not if my context seq
	   matches realm seq, and my parents' does to.  (this works
	   because we rebuild_snap_realms() works _downward_ in
	   hierarchy after each update.) */
	if (realm->cached_context &&
	    realm->cached_context->seq == realm->seq &&
	    (!parent ||
	     realm->cached_context->seq >= parent->cached_context->seq)) {
		dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
		     " (unchanged)\n",
		     realm->ino, realm, realm->cached_context,
		     realm->cached_context->seq,
		     (unsigned int) realm->cached_context->num_snaps);
		return 0;
	}

	/* alloc new snap context; guard the element count against
	 * overflowing the allocation size computation */
	err = -ENOMEM;
	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
		goto fail;
	snapc = ceph_create_snap_context(num, GFP_NOFS);
	if (!snapc)
		goto fail;

	/* build (reverse sorted) snap vector */
	num = 0;
	snapc->seq = realm->seq;
	if (parent) {
		u32 i;

		/* include any of parent's snaps occurring _after_ my
		   parent became my parent */
		for (i = 0; i < parent->cached_context->num_snaps; i++)
			if (parent->cached_context->snaps[i] >=
			    realm->parent_since)
				snapc->snaps[num++] =
					parent->cached_context->snaps[i];
		/* context seq is the max over ourselves and ancestors */
		if (parent->cached_context->seq > snapc->seq)
			snapc->seq = parent->cached_context->seq;
	}
	memcpy(snapc->snaps + num, realm->snaps,
	       sizeof(u64)*realm->num_snaps);
	num += realm->num_snaps;
	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
	       sizeof(u64)*realm->num_prior_parent_snaps);
	num += realm->num_prior_parent_snaps;

	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
	snapc->num_snaps = num;
	dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
	     realm->ino, realm, snapc, snapc->seq,
	     (unsigned int) snapc->num_snaps);

	/* swap in the new context, dropping the old one's ref */
	if (realm->cached_context)
		ceph_put_snap_context(realm->cached_context);
	realm->cached_context = snapc;
	return 0;

fail:
	/*
	 * if we fail, clear old (incorrect) cached_context... hopefully
	 * we'll have better luck building it later
	 */
	if (realm->cached_context) {
		ceph_put_snap_context(realm->cached_context);
		realm->cached_context = NULL;
	}
	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
	       realm, err);
	return err;
}
|
||||
|
||||
/*
 * rebuild snap context for the given realm and all of its children.
 *
 * Recurses depth-first down the realm hierarchy; recursion depth is
 * bounded by the realm tree depth.  Errors from build_snap_context()
 * are logged there but deliberately not propagated here.
 */
static void rebuild_snap_realms(struct ceph_snap_realm *realm)
{
	struct ceph_snap_realm *child;

	dout("rebuild_snap_realms %llx %p\n", realm, realm->ino);
	build_snap_context(realm);

	list_for_each_entry(child, &realm->children, child_item)
		rebuild_snap_realms(child);
}
|
||||
|
||||
|
||||
/*
|
||||
* helper to allocate and decode an array of snapids. free prior
|
||||
* instance, if any.
|
||||
*/
|
||||
static int dup_array(u64 **dst, __le64 *src, u32 num)
|
||||
{
|
||||
u32 i;
|
||||
|
||||
kfree(*dst);
|
||||
if (num) {
|
||||
*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
|
||||
if (!*dst)
|
||||
return -ENOMEM;
|
||||
for (i = 0; i < num; i++)
|
||||
(*dst)[i] = get_unaligned_le64(src + i);
|
||||
} else {
|
||||
*dst = NULL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
 * When a snapshot is applied, the size/mtime inode metadata is queued
 * in a ceph_cap_snap (one for each snapshot) until writeback
 * completes and the metadata can be flushed back to the MDS.
 *
 * However, if a (sync) write is currently in-progress when we apply
 * the snapshot, we have to wait until the write succeeds or fails
 * (and a final size/mtime is known).  In this case the
 * cap_snap->writing = 1, and is said to be "pending."  When the write
 * finishes, we __ceph_finish_cap_snap().
 *
 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
 * change).
 */
void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap_snap *capsnap;
	int used, dirty;

	/* allocate before taking i_ceph_lock; may be freed below if
	 * it turns out nothing needs queuing */
	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
	if (!capsnap) {
		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
		return;
	}

	spin_lock(&ci->i_ceph_lock);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);

	/*
	 * If there is a write in progress, treat that as a dirty Fw,
	 * even though it hasn't completed yet; by the time we finish
	 * up this capsnap it will be.
	 */
	if (used & CEPH_CAP_FILE_WR)
		dirty |= CEPH_CAP_FILE_WR;

	if (__ceph_have_pending_cap_snap(ci)) {
		/* there is no point in queuing multiple "pending" cap_snaps,
		   as no new writes are allowed to start when pending, so any
		   writes in progress now were started before the previous
		   cap_snap.  lucky us. */
		dout("queue_cap_snap %p already pending\n", inode);
		kfree(capsnap);
	} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
			    CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
		struct ceph_snap_context *snapc = ci->i_head_snapc;

		/*
		 * if we are a sync write, we may need to go to the snaprealm
		 * to get the current snapc.
		 */
		if (!snapc)
			snapc = ci->i_snap_realm->cached_context;

		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
		     inode, capsnap, snapc, ceph_cap_string(dirty));
		/* hold an inode ref for the queued capsnap */
		ihold(inode);

		atomic_set(&capsnap->nref, 1);
		capsnap->ci = ci;
		INIT_LIST_HEAD(&capsnap->ci_item);
		INIT_LIST_HEAD(&capsnap->flushing_item);

		capsnap->follows = snapc->seq;
		capsnap->issued = __ceph_caps_issued(ci, NULL);
		capsnap->dirty = dirty;

		/* snapshot current ownership/mode metadata */
		capsnap->mode = inode->i_mode;
		capsnap->uid = inode->i_uid;
		capsnap->gid = inode->i_gid;

		if (dirty & CEPH_CAP_XATTR_EXCL) {
			/* capture the xattr blob as of this snapshot */
			__ceph_build_xattrs_blob(ci);
			capsnap->xattr_blob =
				ceph_buffer_get(ci->i_xattrs.blob);
			capsnap->xattr_version = ci->i_xattrs.version;
		} else {
			capsnap->xattr_blob = NULL;
			capsnap->xattr_version = 0;
		}

		/* dirty page count moved from _head to this cap_snap;
		   all subsequent writes page dirties occur _after_ this
		   snapshot. */
		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
		ci->i_wrbuffer_ref_head = 0;
		capsnap->context = snapc;
		ci->i_head_snapc =
			ceph_get_snap_context(ci->i_snap_realm->cached_context);
		dout(" new snapc is %p\n", ci->i_head_snapc);
		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);

		if (used & CEPH_CAP_FILE_WR) {
			dout("queue_cap_snap %p cap_snap %p snapc %p"
			     " seq %llu used WR, now pending\n", inode,
			     capsnap, snapc, snapc->seq);
			/* finalized later, when the write completes */
			capsnap->writing = 1;
		} else {
			/* note mtime, size NOW. */
			__ceph_finish_cap_snap(ci, capsnap);
		}
	} else {
		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
		kfree(capsnap);
	}

	spin_unlock(&ci->i_ceph_lock);
}
|
||||
|
||||
/*
 * Finalize the size, mtime for a cap_snap.. that is, settle on final values
 * to be used for the snapshot, to be flushed back to the mds.
 *
 * If capsnap can now be flushed, add to snap_flush list, and return 1.
 * Returns 0 if dirty pages still prevent flushing.
 *
 * Caller must hold i_ceph_lock.
 */
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
			   struct ceph_cap_snap *capsnap)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

	BUG_ON(capsnap->writing);
	/* record the final inode metadata for this snapshot */
	capsnap->size = inode->i_size;
	capsnap->mtime = inode->i_mtime;
	capsnap->atime = inode->i_atime;
	capsnap->ctime = inode->i_ctime;
	capsnap->time_warp_seq = ci->i_time_warp_seq;
	if (capsnap->dirty_pages) {
		/* not flushable yet: wait for dirty page writeback to
		 * finish (writeback completion re-calls us) */
		dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
		     "still has %d dirty pages\n", inode, capsnap,
		     capsnap->context, capsnap->context->seq,
		     ceph_cap_string(capsnap->dirty), capsnap->size,
		     capsnap->dirty_pages);
		return 0;
	}
	dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
	     inode, capsnap, capsnap->context,
	     capsnap->context->seq, ceph_cap_string(capsnap->dirty),
	     capsnap->size);

	spin_lock(&mdsc->snap_flush_lock);
	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
	spin_unlock(&mdsc->snap_flush_lock);
	return 1;  /* caller may want to ceph_flush_snaps */
}
|
||||
|
||||
/*
|
||||
* Queue cap_snaps for snap writeback for this realm and its children.
|
||||
* Called under snap_rwsem, so realm topology won't change.
|
||||
*/
|
||||
static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
|
||||
{
|
||||
struct ceph_inode_info *ci;
|
||||
struct inode *lastinode = NULL;
|
||||
struct ceph_snap_realm *child;
|
||||
|
||||
dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
|
||||
|
||||
spin_lock(&realm->inodes_with_caps_lock);
|
||||
list_for_each_entry(ci, &realm->inodes_with_caps,
|
||||
i_snap_realm_item) {
|
||||
struct inode *inode = igrab(&ci->vfs_inode);
|
||||
if (!inode)
|
||||
continue;
|
||||
spin_unlock(&realm->inodes_with_caps_lock);
|
||||
if (lastinode)
|
||||
iput(lastinode);
|
||||
lastinode = inode;
|
||||
ceph_queue_cap_snap(ci);
|
||||
spin_lock(&realm->inodes_with_caps_lock);
|
||||
}
|
||||
spin_unlock(&realm->inodes_with_caps_lock);
|
||||
if (lastinode)
|
||||
iput(lastinode);
|
||||
|
||||
list_for_each_entry(child, &realm->children, child_item) {
|
||||
dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
|
||||
realm, realm->ino, child, child->ino);
|
||||
list_del_init(&child->dirty_item);
|
||||
list_add(&child->dirty_item, &realm->dirty_item);
|
||||
}
|
||||
|
||||
list_del_init(&realm->dirty_item);
|
||||
dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
|
||||
}
|
||||
|
||||
/*
 * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
 * the snap realm parameters from a given realm and all of its ancestors,
 * up to the root.
 *
 * Returns 0 on success, -EINVAL on a malformed trace, or a negative
 * errno from realm creation/update.  @deletion indicates a snap
 * destroy, in which case cap_snaps still get queued for affected
 * realms (the flag is currently informational here).
 *
 * Caller must hold snap_rwsem for write.
 */
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
			   void *p, void *e, bool deletion)
{
	struct ceph_mds_snap_realm *ri;    /* encoded */
	__le64 *snaps;                     /* encoded */
	__le64 *prior_parent_snaps;        /* encoded */
	struct ceph_snap_realm *realm;
	int invalidate = 0;
	int err = -ENOMEM;
	LIST_HEAD(dirty_realms);

	dout("update_snap_trace deletion=%d\n", deletion);
more:
	/* decode one ceph_mds_snap_realm record plus its two
	 * trailing snapid arrays; bail to "bad" on short buffer */
	ceph_decode_need(&p, e, sizeof(*ri), bad);
	ri = p;
	p += sizeof(*ri);
	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
	snaps = p;
	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
	prior_parent_snaps = p;
	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);

	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
	if (!realm) {
		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
		if (IS_ERR(realm)) {
			err = PTR_ERR(realm);
			goto fail;
		}
	}

	/* ensure the parent is correct */
	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
	if (err < 0)
		goto fail;
	/* adjust_snap_realm_parent() returns 1 if the parent changed */
	invalidate += err;

	if (le64_to_cpu(ri->seq) > realm->seq) {
		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
		/* update realm parameters, snap lists */
		realm->seq = le64_to_cpu(ri->seq);
		realm->created = le64_to_cpu(ri->created);
		realm->parent_since = le64_to_cpu(ri->parent_since);

		realm->num_snaps = le32_to_cpu(ri->num_snaps);
		err = dup_array(&realm->snaps, snaps, realm->num_snaps);
		if (err < 0)
			goto fail;

		realm->num_prior_parent_snaps =
			le32_to_cpu(ri->num_prior_parent_snaps);
		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
				realm->num_prior_parent_snaps);
		if (err < 0)
			goto fail;

		/* queue realm for cap_snap creation */
		list_add(&realm->dirty_item, &dirty_realms);

		invalidate = 1;
	} else if (!realm->cached_context) {
		dout("update_snap_trace %llx %p seq %lld new\n",
		     realm->ino, realm, realm->seq);
		invalidate = 1;
	} else {
		dout("update_snap_trace %llx %p seq %lld unchanged\n",
		     realm->ino, realm, realm->seq);
	}

	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
	     realm, invalidate, p, e);

	/* more encoded realms to process? */
	if (p < e)
		goto more;

	/* invalidate when we reach the _end_ (root) of the trace */
	if (invalidate)
		rebuild_snap_realms(realm);

	/*
	 * queue cap snaps _after_ we've built the new snap contexts,
	 * so that i_head_snapc can be set appropriately.
	 */
	while (!list_empty(&dirty_realms)) {
		/* queue_realm_cap_snaps() removes the realm from the
		 * dirty list (and may move its children onto it) */
		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
					 dirty_item);
		queue_realm_cap_snaps(realm);
	}

	__cleanup_empty_realms(mdsc);
	return 0;

bad:
	err = -EINVAL;
fail:
	pr_err("update_snap_trace error %d\n", err);
	return err;
}
|
||||
|
||||
|
||||
/*
 * Send any cap_snaps that are queued for flush.  Try to carry
 * s_mutex across multiple snap flushes to avoid locking overhead.
 *
 * Caller holds no locks.
 */
static void flush_snaps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	struct inode *inode;
	struct ceph_mds_session *session = NULL;

	dout("flush_snaps\n");
	spin_lock(&mdsc->snap_flush_lock);
	while (!list_empty(&mdsc->snap_flush_list)) {
		ci = list_first_entry(&mdsc->snap_flush_list,
				struct ceph_inode_info, i_snap_flush_item);
		inode = &ci->vfs_inode;
		ihold(inode);
		/* drop the flush-list lock while flushing; the ihold
		 * keeps the inode alive across the unlock */
		spin_unlock(&mdsc->snap_flush_lock);
		spin_lock(&ci->i_ceph_lock);
		/* __ceph_flush_snaps() caches the locked session in
		 * *session so consecutive flushes to the same MDS
		 * reuse it */
		__ceph_flush_snaps(ci, &session, 0);
		spin_unlock(&ci->i_ceph_lock);
		iput(inode);
		spin_lock(&mdsc->snap_flush_lock);
	}
	spin_unlock(&mdsc->snap_flush_lock);

	/* release the last session we held across iterations */
	if (session) {
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
	}
	dout("flush_snaps done\n");
}
|
||||
|
||||
|
||||
/*
 * Handle a snap notification from the MDS.
 *
 * This can take two basic forms: the simplest is just a snap creation
 * or deletion notification on an existing realm.  This should update the
 * realm and its children.
 *
 * The more difficult case is realm creation, due to snap creation at a
 * new point in the file hierarchy, or due to a rename that moves a file or
 * directory into another realm.
 */
void ceph_handle_snap(struct ceph_mds_client *mdsc,
		      struct ceph_mds_session *session,
		      struct ceph_msg *msg)
{
	struct super_block *sb = mdsc->fsc->sb;
	int mds = session->s_mds;
	u64 split;
	int op;
	int trace_len;
	struct ceph_snap_realm *realm = NULL;
	void *p = msg->front.iov_base;
	void *e = p + msg->front.iov_len;
	struct ceph_mds_snap_head *h;
	int num_split_inos, num_split_realms;
	__le64 *split_inos = NULL, *split_realms = NULL;
	int i;
	int locked_rwsem = 0;

	/* decode */
	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	h = p;
	op = le32_to_cpu(h->op);
	split = le64_to_cpu(h->split);   /* non-zero if we are splitting an
					  * existing realm */
	num_split_inos = le32_to_cpu(h->num_split_inos);
	num_split_realms = le32_to_cpu(h->num_split_realms);
	trace_len = le32_to_cpu(h->trace_len);
	p += sizeof(*h);

	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
	     ceph_snap_op_name(op), split, trace_len);

	/* bump the session seq to acknowledge receipt of this message */
	mutex_lock(&session->s_mutex);
	session->s_seq++;
	mutex_unlock(&session->s_mutex);

	down_write(&mdsc->snap_rwsem);
	locked_rwsem = 1;

	if (op == CEPH_SNAP_OP_SPLIT) {
		struct ceph_mds_snap_realm *ri;

		/*
		 * A "split" breaks part of an existing realm off into
		 * a new realm.  The MDS provides a list of inodes
		 * (with caps) and child realms that belong to the new
		 * child.
		 */
		split_inos = p;
		p += sizeof(u64) * num_split_inos;
		split_realms = p;
		p += sizeof(u64) * num_split_realms;
		ceph_decode_need(&p, e, sizeof(*ri), bad);
		/* we will peek at realm info here, but will _not_
		 * advance p, as the realm update will occur below in
		 * ceph_update_snap_trace. */
		ri = p;

		realm = ceph_lookup_snap_realm(mdsc, split);
		if (!realm) {
			realm = ceph_create_snap_realm(mdsc, split);
			if (IS_ERR(realm))
				goto out;
		}
		/* extra ref so the realm survives until the put below */
		ceph_get_snap_realm(mdsc, realm);

		dout("splitting snap_realm %llx %p\n", realm->ino, realm);
		for (i = 0; i < num_split_inos; i++) {
			struct ceph_vino vino = {
				.ino = le64_to_cpu(split_inos[i]),
				.snap = CEPH_NOSNAP,
			};
			struct inode *inode = ceph_find_inode(sb, vino);
			struct ceph_inode_info *ci;
			struct ceph_snap_realm *oldrealm;

			/* inode not in cache: nothing to move */
			if (!inode)
				continue;
			ci = ceph_inode(inode);

			spin_lock(&ci->i_ceph_lock);
			if (!ci->i_snap_realm)
				goto skip_inode;
			/*
			 * If this inode belongs to a realm that was
			 * created after our new realm, we experienced
			 * a race (due to another split notifications
			 * arriving from a different MDS).  So skip
			 * this inode.
			 */
			if (ci->i_snap_realm->created >
			    le64_to_cpu(ri->created)) {
				dout(" leaving %p in newer realm %llx %p\n",
				     inode, ci->i_snap_realm->ino,
				     ci->i_snap_realm);
				goto skip_inode;
			}
			dout(" will move %p to split realm %llx %p\n",
			     inode, realm->ino, realm);
			/*
			 * Move the inode to the new realm
			 */
			spin_lock(&realm->inodes_with_caps_lock);
			list_del_init(&ci->i_snap_realm_item);
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			oldrealm = ci->i_snap_realm;
			ci->i_snap_realm = realm;
			spin_unlock(&realm->inodes_with_caps_lock);
			spin_unlock(&ci->i_ceph_lock);

			/* take a ref for the new realm pointer, drop
			 * the one held via the old realm */
			ceph_get_snap_realm(mdsc, realm);
			ceph_put_snap_realm(mdsc, oldrealm);

			iput(inode);
			continue;

skip_inode:
			spin_unlock(&ci->i_ceph_lock);
			iput(inode);
		}

		/* we may have taken some of the old realm's children. */
		for (i = 0; i < num_split_realms; i++) {
			struct ceph_snap_realm *child =
				ceph_lookup_snap_realm(mdsc,
					   le64_to_cpu(split_realms[i]));
			if (!child)
				continue;
			adjust_snap_realm_parent(mdsc, child, realm->ino);
		}
	}

	/*
	 * update using the provided snap trace. if we are deleting a
	 * snap, we can avoid queueing cap_snaps.
	 *
	 * NOTE(review): the return value of ceph_update_snap_trace()
	 * is ignored here, so a malformed trace is only logged --
	 * confirm this is intentional.
	 */
	ceph_update_snap_trace(mdsc, p, e,
			       op == CEPH_SNAP_OP_DESTROY);

	if (op == CEPH_SNAP_OP_SPLIT)
		/* we took a reference when we created the realm, above */
		ceph_put_snap_realm(mdsc, realm);

	__cleanup_empty_realms(mdsc);

	up_write(&mdsc->snap_rwsem);

	flush_snaps(mdsc);
	return;

bad:
	pr_err("corrupt snap message from mds%d\n", mds);
	ceph_msg_dump(msg);
out:
	if (locked_rwsem)
		up_write(&mdsc->snap_rwsem);
	return;
}
|
||||
|
||||
|
||||
|
124
fs/ceph/strings.c
Normal file
124
fs/ceph/strings.c
Normal file
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Ceph fs string constants
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/ceph/types.h>
|
||||
|
||||
|
||||
/*
 * Return a human-readable name for an MDS state code (debug/status
 * output); "???" for unrecognized values.
 */
const char *ceph_mds_state_name(int s)
{
	switch (s) {
		/* down and out */
	case CEPH_MDS_STATE_DNE:        return "down:dne";
	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
		/* up and out */
	case CEPH_MDS_STATE_BOOT:       return "up:boot";
	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
	case CEPH_MDS_STATE_CREATING:   return "up:creating";
	case CEPH_MDS_STATE_STARTING:   return "up:starting";
		/* up and in */
	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
	}
	return "???";
}
|
||||
|
||||
/*
 * Return a human-readable name for an MDS session operation code
 * (debug output); "???" for unrecognized values.
 */
const char *ceph_session_op_name(int op)
{
	switch (op) {
	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
	case CEPH_SESSION_OPEN: return "open";
	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
	case CEPH_SESSION_CLOSE: return "close";
	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
	case CEPH_SESSION_STALE: return "stale";
	case CEPH_SESSION_RECALL_STATE: return "recall_state";
	case CEPH_SESSION_FLUSHMSG: return "flushmsg";
	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
	}
	return "???";
}
|
||||
|
||||
const char *ceph_mds_op_name(int op)
|
||||
{
|
||||
switch (op) {
|
||||
case CEPH_MDS_OP_LOOKUP: return "lookup";
|
||||
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
|
||||
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
|
||||
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
|
||||
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
|
||||
case CEPH_MDS_OP_GETATTR: return "getattr";
|
||||
case CEPH_MDS_OP_SETXATTR: return "setxattr";
|
||||
case CEPH_MDS_OP_SETATTR: return "setattr";
|
||||
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
|
||||
case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
|
||||
case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
|
||||
case CEPH_MDS_OP_READDIR: return "readdir";
|
||||
case CEPH_MDS_OP_MKNOD: return "mknod";
|
||||
case CEPH_MDS_OP_LINK: return "link";
|
||||
case CEPH_MDS_OP_UNLINK: return "unlink";
|
||||
case CEPH_MDS_OP_RENAME: return "rename";
|
||||
case CEPH_MDS_OP_MKDIR: return "mkdir";
|
||||
case CEPH_MDS_OP_RMDIR: return "rmdir";
|
||||
case CEPH_MDS_OP_SYMLINK: return "symlink";
|
||||
case CEPH_MDS_OP_CREATE: return "create";
|
||||
case CEPH_MDS_OP_OPEN: return "open";
|
||||
case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
|
||||
case CEPH_MDS_OP_LSSNAP: return "lssnap";
|
||||
case CEPH_MDS_OP_MKSNAP: return "mksnap";
|
||||
case CEPH_MDS_OP_RMSNAP: return "rmsnap";
|
||||
case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
|
||||
case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
|
||||
}
|
||||
return "???";
|
||||
}
|
||||
|
||||
/*
 * Return a human-readable name for a capability operation code
 * (debug output); "???" for unrecognized values.
 */
const char *ceph_cap_op_name(int op)
{
	switch (op) {
	case CEPH_CAP_OP_GRANT: return "grant";
	case CEPH_CAP_OP_REVOKE: return "revoke";
	case CEPH_CAP_OP_TRUNC: return "trunc";
	case CEPH_CAP_OP_EXPORT: return "export";
	case CEPH_CAP_OP_IMPORT: return "import";
	case CEPH_CAP_OP_UPDATE: return "update";
	case CEPH_CAP_OP_DROP: return "drop";
	case CEPH_CAP_OP_FLUSH: return "flush";
	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
	case CEPH_CAP_OP_RELEASE: return "release";
	case CEPH_CAP_OP_RENEW: return "renew";
	}
	return "???";
}
|
||||
|
||||
const char *ceph_lease_op_name(int o)
|
||||
{
|
||||
switch (o) {
|
||||
case CEPH_MDS_LEASE_REVOKE: return "revoke";
|
||||
case CEPH_MDS_LEASE_RELEASE: return "release";
|
||||
case CEPH_MDS_LEASE_RENEW: return "renew";
|
||||
case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
|
||||
}
|
||||
return "???";
|
||||
}
|
||||
|
||||
/*
 * Return a human-readable name for a snap notification operation code
 * (debug output); "???" for unrecognized values.
 */
const char *ceph_snap_op_name(int o)
{
	switch (o) {
	case CEPH_SNAP_OP_UPDATE: return "update";
	case CEPH_SNAP_OP_CREATE: return "create";
	case CEPH_SNAP_OP_DESTROY: return "destroy";
	case CEPH_SNAP_OP_SPLIT: return "split";
	}
	return "???";
}
|
1061
fs/ceph/super.c
Normal file
1061
fs/ceph/super.c
Normal file
File diff suppressed because it is too large
Load diff
906
fs/ceph/super.h
Normal file
906
fs/ceph/super.h
Normal file
|
@ -0,0 +1,906 @@
|
|||
#ifndef _FS_CEPH_SUPER_H
|
||||
#define _FS_CEPH_SUPER_H
|
||||
|
||||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/exportfs.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/posix_acl.h>
|
||||
|
||||
#include <linux/ceph/libceph.h>
|
||||
|
||||
#ifdef CONFIG_CEPH_FSCACHE
|
||||
#include <linux/fscache.h>
|
||||
#endif
|
||||
|
||||
/* f_type in struct statfs */
|
||||
#define CEPH_SUPER_MAGIC 0x00c36400
|
||||
|
||||
/* large granularity for statfs utilization stats to facilitate
|
||||
* large volume sizes on 32-bit machines. */
|
||||
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
|
||||
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
|
||||
|
||||
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
|
||||
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
|
||||
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
|
||||
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
|
||||
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
|
||||
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
|
||||
|
||||
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
|
||||
|
||||
#define ceph_set_mount_opt(fsc, opt) \
|
||||
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
|
||||
#define ceph_test_mount_opt(fsc, opt) \
|
||||
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
|
||||
|
||||
#define CEPH_RSIZE_DEFAULT 0 /* max read size */
|
||||
#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
|
||||
#define CEPH_MAX_READDIR_DEFAULT 1024
|
||||
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
|
||||
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
|
||||
|
||||
/*
 * Mount options for a ceph superblock, parsed at mount time.
 */
struct ceph_mount_options {
	int flags;		/* CEPH_MOUNT_OPT_* bits */
	int sb_flags;		/* superblock flags handed to the VFS */

	int wsize;            /* max write size */
	int rsize;            /* max read size */
	int rasize;           /* max readahead */
	int congestion_kb;    /* max writeback in flight */
	int caps_wanted_delay_min, caps_wanted_delay_max;
	int cap_release_safety;
	int max_readdir;       /* max readdir result (entries) */
	int max_readdir_bytes; /* max readdir result (bytes) */

	/*
	 * everything above this point can be memcmp'd; everything below
	 * is handled in compare_mount_options()
	 */

	char *snapdir_name;   /* default ".snap" */
};
|
||||
|
||||
/*
 * Per-superblock ceph client state: the parsed mount options, the
 * shared libceph client handle, the MDS client, and the writeback
 * machinery.  Reached from a superblock via sb->s_fs_info (see
 * ceph_sb_to_client()).
 */
struct ceph_fs_client {
	struct super_block *sb;

	struct ceph_mount_options *mount_options;
	struct ceph_client *client;	/* shared libceph client handle */

	unsigned long mount_state;
	int min_caps;			/* min caps i added */

	struct ceph_mds_client *mdsc;	/* MDS session/request state */

	/* writeback */
	mempool_t *wb_pagevec_pool;
	struct workqueue_struct *wb_wq;		/* services i_wb_work */
	struct workqueue_struct *pg_inv_wq;	/* services i_pg_inv_work */
	struct workqueue_struct *trunc_wq;	/* services i_vmtruncate_work */
	atomic_long_t writeback_count;

	struct backing_dev_info backing_dev_info;

#ifdef CONFIG_DEBUG_FS
	/* debugfs entries; see ceph_fs_debugfs_init()/cleanup() below */
	struct dentry *debugfs_dentry_lru, *debugfs_caps;
	struct dentry *debugfs_congestion_kb;
	struct dentry *debugfs_bdi;
	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
	struct dentry *debugfs_mds_sessions;
#endif

#ifdef CONFIG_CEPH_FSCACHE
	struct fscache_cookie *fscache;		/* per-fs fscache cookie */
	struct workqueue_struct *revalidate_wq;	/* services i_revalidate_work */
#endif
};
|
||||
|
||||
|
||||
/*
|
||||
* File i/o capability. This tracks shared state with the metadata
|
||||
* server that allows us to cache or writeback attributes or to read
|
||||
* and write data. For any given inode, we should have one or more
|
||||
* capabilities, one issued by each metadata server, and our
|
||||
* cumulative access is the OR of all issued capabilities.
|
||||
*
|
||||
* Each cap is referenced by the inode's i_caps rbtree and by per-mds
|
||||
* session capability lists.
|
||||
*/
|
||||
struct ceph_cap {
	struct ceph_inode_info *ci;		/* inode this cap applies to */
	struct rb_node ci_node;			/* per-ci cap tree (ci->i_caps) */
	struct ceph_mds_session *session;	/* mds session that issued us */
	struct list_head session_caps;		/* per-session caplist */
	int mds;				/* mds index for this cap */
	u64 cap_id;				/* unique cap id (mds provided) */
	int issued;				/* latest, from the mds */
	int implemented;			/* implemented superset of issued (for revocation) */
	int mds_wanted;				/* caps the mds believes we want */
	u32 seq, issue_seq, mseq;		/* cap/issue/migration sequence numbers */
	u32 cap_gen;				/* active/stale cycle */
	unsigned long last_used;		/* jiffies -- TODO confirm units */
	struct list_head caps_item;
};
|
||||
|
||||
#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
|
||||
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
|
||||
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
|
||||
|
||||
/*
|
||||
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
|
||||
* we first complete any in-process sync writes and writeback any dirty
|
||||
* data before flushing the snapped state (tracked here) back to the MDS.
|
||||
*/
|
||||
struct ceph_cap_snap {
	atomic_t nref;			/* refcount; see ceph_put_cap_snap() */
	struct ceph_inode_info *ci;	/* inode this snapped state belongs to */
	struct list_head ci_item, flushing_item;

	u64 follows, flush_tid;
	int issued, dirty;		/* cap bits captured at snapshot time */
	struct ceph_snap_context *context;

	/* inode attributes frozen at snapshot time */
	umode_t mode;
	kuid_t uid;
	kgid_t gid;

	struct ceph_buffer *xattr_blob;	/* encoded xattrs, if any */
	u64 xattr_version;

	u64 size;
	struct timespec mtime, atime, ctime;
	u64 time_warp_seq;
	int writing;		/* a sync write is still in progress */
	int dirty_pages;	/* dirty pages awaiting writeback */
};
|
||||
|
||||
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
|
||||
{
|
||||
if (atomic_dec_and_test(&capsnap->nref)) {
|
||||
if (capsnap->xattr_blob)
|
||||
ceph_buffer_put(capsnap->xattr_blob);
|
||||
kfree(capsnap);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The frag tree describes how a directory is fragmented, potentially across
|
||||
* multiple metadata servers. It is also used to indicate points where
|
||||
* metadata authority is delegated, and whether/where metadata is replicated.
|
||||
*
|
||||
* A _leaf_ frag will be present in the i_fragtree IFF there is
|
||||
* delegation info. That is, if mds >= 0 || ndist > 0.
|
||||
*/
|
||||
#define CEPH_MAX_DIRFRAG_REP 4
|
||||
|
||||
struct ceph_inode_frag {
	struct rb_node node;	/* links into ci->i_fragtree */

	/* fragtree state */
	u32 frag;		/* frag identifier */
	int split_by;		/* i.e. 2^(split_by) children */

	/* delegation and replication info */
	int mds;		/* -1 if same authority as parent */
	int ndist;		/* >0 if replicated */
	int dist[CEPH_MAX_DIRFRAG_REP];	/* mds ids holding replicas */
};
|
||||
|
||||
/*
|
||||
* We cache inode xattrs as an encoded blob until they are first used,
|
||||
* at which point we parse them into an rbtree.
|
||||
*/
|
||||
struct ceph_inode_xattr {
	struct rb_node node;	/* links into i_xattrs.index rbtree */

	const char *name;	/* xattr name */
	int name_len;
	const char *val;	/* xattr value */
	int val_len;
	int dirty;		/* modified locally? -- TODO confirm semantics */

	int should_free_name;	/* free name when this entry is destroyed? */
	int should_free_val;	/* free val when this entry is destroyed? */
};
|
||||
|
||||
/*
|
||||
* Ceph dentry state
|
||||
*/
|
||||
struct ceph_dentry_info {
	struct ceph_mds_session *lease_session;	/* session that issued the lease */
	u32 lease_gen, lease_shared_gen;
	u32 lease_seq;
	unsigned long lease_renew_after, lease_renew_from;	/* jiffies */
	struct list_head lru;		/* dentry LRU; see ceph_dentry_lru_*() */
	struct dentry *dentry;		/* back-pointer to the vfs dentry */
	u64 time;
	u64 offset;			/* readdir position -- TODO confirm */
};
|
||||
|
||||
struct ceph_inode_xattrs_info {
	/*
	 * (still encoded) xattr blob. we avoid the overhead of parsing
	 * this until someone actually calls getxattr, etc.
	 *
	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
	 * NULL means we don't know.
	 */
	struct ceph_buffer *blob, *prealloc_blob;

	struct rb_root index;	/* parsed entries (struct ceph_inode_xattr) */
	bool dirty;		/* any entry modified locally? */
	int count;		/* number of parsed entries */
	int names_size;		/* cumulative name bytes -- TODO confirm */
	int vals_size;		/* cumulative value bytes -- TODO confirm */
	u64 version, index_version;
};
|
||||
|
||||
/*
|
||||
* Ceph inode.
|
||||
*/
|
||||
/*
 * Ceph in-memory inode: all ceph-specific state hung around the
 * embedded vfs_inode, which MUST remain the last member so that
 * ceph_inode() can recover the container with container_of().
 */
struct ceph_inode_info {
	struct ceph_vino i_vino;	/* ceph ino + snap */

	spinlock_t i_ceph_lock;		/* guards cap state below; see notes */

	u64 i_version;
	u32 i_time_warp_seq;

	unsigned i_ceph_flags;		/* CEPH_I_* bits */
	atomic_t i_release_count;
	atomic_t i_complete_count;	/* dir is "complete" when this
					   equals i_release_count; see
					   __ceph_dir_is_complete() */

	struct ceph_dir_layout i_dir_layout;
	struct ceph_file_layout i_layout;
	char *i_symlink;		/* symlink target, if a symlink */

	/* for dirs */
	struct timespec i_rctime;
	u64 i_rbytes, i_rfiles, i_rsubdirs;
	u64 i_files, i_subdirs;

	struct rb_root i_fragtree;	/* struct ceph_inode_frag nodes */
	struct mutex i_fragtree_mutex;

	struct ceph_inode_xattrs_info i_xattrs;

	/* capabilities.  protected _both_ by i_ceph_lock and cap->session's
	 * s_mutex. */
	struct rb_root i_caps;		/* cap list */
	struct ceph_cap *i_auth_cap;	/* authoritative cap, if any */
	unsigned i_dirty_caps, i_flushing_caps;	/* mask of dirtied fields */
	struct list_head i_dirty_item, i_flushing_item;
	u64 i_cap_flush_seq;
	/* we need to track cap writeback on a per-cap-bit basis, to allow
	 * overlapping, pipelined cap flushes to the mds.  we can probably
	 * reduce the tid to 8 bits if we're concerned about inode size. */
	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
	wait_queue_head_t i_cap_wq;	/* threads waiting on a capability */
	unsigned long i_hold_caps_min;	/* jiffies */
	unsigned long i_hold_caps_max;	/* jiffies */
	struct list_head i_cap_delay_list;	/* for delayed cap release to mds */
	struct ceph_cap_reservation i_cap_migration_resv;
	struct list_head i_cap_snaps;	/* snapped state pending flush to mds */
	struct ceph_snap_context *i_head_snapc;	/* set if wr_buffer_head > 0 or
						   dirty|flushing caps */
	unsigned i_snap_caps;		/* cap bits for snapped files */

	int i_nr_by_mode[CEPH_FILE_MODE_NUM];	/* open file counts */

	struct mutex i_truncate_mutex;
	u32 i_truncate_seq;	/* last truncate to smaller size */
	u64 i_truncate_size;	/* and the size we last truncated down to */
	int i_truncate_pending;	/* still need to call vmtruncate */

	u64 i_max_size;		/* max file size authorized by mds */
	u64 i_reported_size;	/* (max_)size reported to or requested of mds */
	u64 i_wanted_max_size;	/* offset we'd like to write too */
	u64 i_requested_max_size;	/* max_size we've requested */

	/* held references to caps */
	int i_pin_ref;
	int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
	int i_wrbuffer_ref, i_wrbuffer_ref_head;
	u32 i_shared_gen;	/* increment each time we get FILE_SHARED */
	u32 i_rdcache_gen;	/* incremented each time we get FILE_CACHE. */
	u32 i_rdcache_revoking;	/* RDCACHE gen to async invalidate, if any */

	struct list_head i_unsafe_writes;	/* uncommitted sync writes */
	struct list_head i_unsafe_dirops;	/* uncommitted mds dir ops */
	spinlock_t i_unsafe_lock;	/* guards the two lists above */

	struct ceph_snap_realm *i_snap_realm;	/* snap realm (if caps) */
	int i_snap_realm_counter;	/* snap realm (if caps) */
	struct list_head i_snap_realm_item;
	struct list_head i_snap_flush_item;

	struct work_struct i_wb_work;		/* writeback work */
	struct work_struct i_pg_inv_work;	/* page invalidation work */

	struct work_struct i_vmtruncate_work;	/* deferred truncate work */

#ifdef CONFIG_CEPH_FSCACHE
	struct fscache_cookie *fscache;
	u32 i_fscache_gen;	/* sequence, for delayed fscache validate */
	struct work_struct i_revalidate_work;
#endif
	struct inode vfs_inode;	/* at end */
};
|
||||
|
||||
/* Map a vfs inode to its containing ceph inode (vfs_inode is embedded). */
static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
{
	return container_of(inode, struct ceph_inode_info, vfs_inode);
}
|
||||
|
||||
static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
|
||||
{
|
||||
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
|
||||
}
|
||||
|
||||
static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
|
||||
{
|
||||
return (struct ceph_fs_client *)sb->s_fs_info;
|
||||
}
|
||||
|
||||
/* Return the ceph vino (64-bit ino + snap id) for an inode. */
static inline struct ceph_vino ceph_vino(struct inode *inode)
{
	return ceph_inode(inode)->i_vino;
}
|
||||
|
||||
/*
|
||||
* ino_t is <64 bits on many architectures, blech.
|
||||
*
|
||||
* i_ino (kernel inode) st_ino (userspace)
|
||||
* i386 32 32
|
||||
* x86_64+ino32 64 32
|
||||
* x86_64 64 64
|
||||
*/
|
||||
/*
 * Squash a 64-bit ceph ino into 32 bits by xoring the two halves
 * together.  Never return 0; substitute 2 for it (presumably 0/1 are
 * reserved -- TODO confirm against ino allocation rules).
 */
static inline u32 ceph_ino_to_ino32(__u64 vino)
{
	u32 ino = (u32)(vino ^ (vino >> 32));

	return ino ? ino : 2;
}
|
||||
|
||||
/*
|
||||
* kernel i_ino value
|
||||
*/
|
||||
/* Compute the kernel i_ino value for a vino. */
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
#if BITS_PER_LONG == 32
	/* ino_t cannot hold 64 bits here; fold to 32 */
	return ceph_ino_to_ino32(vino.ino);
#else
	return (ino_t)vino.ino;
#endif
}
|
||||
|
||||
/*
|
||||
* user-visible ino (stat, filldir)
|
||||
*/
|
||||
#if BITS_PER_LONG == 32
/* 32-bit kernels: i_ino already fits; nothing to translate. */
static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
{
	return ino;
}
#else
/* 64-bit kernels: honor the ino32 mount option by folding to 32 bits. */
static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
{
	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
		ino = ceph_ino_to_ino32(ino);
	return ino;
}
#endif
|
||||
|
||||
|
||||
/* for printf-style formatting */
|
||||
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
|
||||
|
||||
/* Return the 64-bit ceph ino for an inode. */
static inline u64 ceph_ino(struct inode *inode)
{
	return ceph_inode(inode)->i_vino.ino;
}

/* Return the snap id for an inode. */
static inline u64 ceph_snap(struct inode *inode)
{
	return ceph_inode(inode)->i_vino.snap;
}
|
||||
|
||||
static inline int ceph_ino_compare(struct inode *inode, void *data)
|
||||
{
|
||||
struct ceph_vino *pvino = (struct ceph_vino *)data;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
return ci->i_vino.ino == pvino->ino &&
|
||||
ci->i_vino.snap == pvino->snap;
|
||||
}
|
||||
|
||||
/*
 * Look up an inode in the inode cache by vino.  The 32/64-bit i_ino
 * hash key may collide, so snap/ino are re-checked via
 * ceph_ino_compare().  Returns a referenced inode or NULL.
 */
static inline struct inode *ceph_find_inode(struct super_block *sb,
					    struct ceph_vino vino)
{
	ino_t t = ceph_vino_to_ino(vino);
	return ilookup5(sb, t, ceph_ino_compare, &vino);
}
|
||||
|
||||
|
||||
/*
 * ceph inode flags (stored in i_ceph_flags)
 */
|
||||
#define CEPH_I_NODELAY 4 /* do not delay cap release */
|
||||
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
|
||||
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
|
||||
|
||||
/* Record a directory as complete as of the given release count. */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
					   int release_count)
{
	atomic_set(&ci->i_complete_count, release_count);
}

/* Invalidate any "complete dir" state by bumping the release count. */
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
	atomic_inc(&ci->i_release_count);
}

/* A dir is complete iff nothing was released since set_complete. */
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{
	return atomic_read(&ci->i_complete_count) ==
		atomic_read(&ci->i_release_count);
}

/* vfs-inode wrapper for __ceph_dir_clear_complete(). */
static inline void ceph_dir_clear_complete(struct inode *inode)
{
	__ceph_dir_clear_complete(ceph_inode(inode));
}

/* vfs-inode wrapper for __ceph_dir_is_complete(). */
static inline bool ceph_dir_is_complete(struct inode *inode)
{
	return __ceph_dir_is_complete(ceph_inode(inode));
}
|
||||
|
||||
|
||||
/* find a specific frag @f */
|
||||
extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
|
||||
u32 f);
|
||||
|
||||
/*
|
||||
* choose fragment for value @v. copy frag content to pfrag, if leaf
|
||||
* exists
|
||||
*/
|
||||
extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
|
||||
struct ceph_inode_frag *pfrag,
|
||||
int *found);
|
||||
|
||||
/* Return the ceph-private state hung off a dentry (d_fsdata). */
static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
{
	return (struct ceph_dentry_info *)dentry->d_fsdata;
}

/* Build a readdir file position: frag in the high 32 bits, offset low. */
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}
|
||||
|
||||
/*
|
||||
* caps helpers
|
||||
*/
|
||||
/* True if this inode holds at least one mds capability. */
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
{
	return !RB_EMPTY_ROOT(&ci->i_caps);
}
|
||||
|
||||
extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
|
||||
extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
|
||||
extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
|
||||
struct ceph_cap *cap);
|
||||
|
||||
/* Locked wrapper: return the currently issued cap bits for @ci. */
static inline int ceph_caps_issued(struct ceph_inode_info *ci)
{
	int issued;
	spin_lock(&ci->i_ceph_lock);
	issued = __ceph_caps_issued(ci, NULL);	/* NULL: don't need "implemented" */
	spin_unlock(&ci->i_ceph_lock);
	return issued;
}
|
||||
|
||||
/*
 * Locked wrapper: test whether the @mask cap bits are issued.
 * @touch is forwarded to __ceph_caps_issued_mask (presumably to
 * refresh cap recency -- TODO confirm against caps.c).
 */
static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
					int touch)
{
	int r;
	spin_lock(&ci->i_ceph_lock);
	r = __ceph_caps_issued_mask(ci, mask, touch);
	spin_unlock(&ci->i_ceph_lock);
	return r;
}
|
||||
|
||||
/*
 * Cap bits that are dirty or mid-flush.  __-prefixed: caller is
 * presumably expected to hold i_ceph_lock -- TODO confirm.
 */
static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
	return ci->i_dirty_caps | ci->i_flushing_caps;
}
|
||||
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
|
||||
|
||||
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
|
||||
struct ceph_cap *ocap, int mask);
|
||||
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
|
||||
extern int __ceph_caps_used(struct ceph_inode_info *ci);
|
||||
|
||||
extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
|
||||
|
||||
/*
|
||||
* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
|
||||
*/
|
||||
static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
|
||||
{
|
||||
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
|
||||
if (w & CEPH_CAP_FILE_BUFFER)
|
||||
w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
|
||||
return w;
|
||||
}
|
||||
|
||||
/* what the mds thinks we want */
|
||||
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
|
||||
|
||||
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
|
||||
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
|
||||
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
|
||||
extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
|
||||
struct ceph_cap_reservation *ctx, int need);
|
||||
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
|
||||
struct ceph_cap_reservation *ctx);
|
||||
extern void ceph_reservation_status(struct ceph_fs_client *client,
|
||||
int *total, int *avail, int *used,
|
||||
int *reserved, int *min);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* we keep buffered readdir results attached to file->private_data
|
||||
*/
|
||||
#define CEPH_F_SYNC 1
|
||||
#define CEPH_F_ATEND 2
|
||||
|
||||
/* Per-open-file state, attached to file->private_data. */
struct ceph_file_info {
	short fmode;	/* initialized on open */
	short flags;	/* CEPH_F_* */

	/* readdir: position within the dir */
	u32 frag;				/* current dir fragment */
	struct ceph_mds_request *last_readdir;	/* previous readdir request */

	/* readdir: position within a frag */
	unsigned offset;	/* offset of last chunk, adjusted for . and .. */
	unsigned next_offset;	/* offset of next chunk (last_name's + 1) */
	char *last_name;	/* last entry in previous chunk */
	struct dentry *dentry;	/* next dentry (for dcache readdir) */
	int dir_release_count;	/* i_release_count snapshot -- TODO confirm */

	/* used for -o dirstat read() on directory thing */
	char *dir_info;
	int dir_info_len;
};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* A "snap realm" describes a subset of the file hierarchy sharing
|
||||
* the same set of snapshots that apply to it. The realms themselves
|
||||
* are organized into a hierarchy, such that children inherit (some of)
|
||||
* the snapshots of their parents.
|
||||
*
|
||||
* All inodes within the realm that have capabilities are linked into a
|
||||
* per-realm list.
|
||||
*/
|
||||
struct ceph_snap_realm {
	u64 ino;		/* realm key; see ceph_lookup_snap_realm() */
	atomic_t nref;		/* refcount (ceph_get/put_snap_realm) */
	struct rb_node node;

	u64 created, seq;
	u64 parent_ino;
	u64 parent_since;	/* snapid when our current parent became so */

	u64 *prior_parent_snaps;	/* snaps inherited from any parents we */
	u32 num_prior_parent_snaps;	/*  had prior to parent_since */
	u64 *snaps;			/* snaps specific to this realm */
	u32 num_snaps;

	struct ceph_snap_realm *parent;
	struct list_head children;	/* list of child realms */
	struct list_head child_item;

	struct list_head empty_item;	/* if i have ref==0 */

	struct list_head dirty_item;	/* if realm needs new context */

	/* the current set of snaps for this realm */
	struct ceph_snap_context *cached_context;

	struct list_head inodes_with_caps;	/* inodes in this realm w/ caps */
	spinlock_t inodes_with_caps_lock;	/* guards inodes_with_caps */
};
|
||||
|
||||
static inline int default_congestion_kb(void)
|
||||
{
|
||||
int congestion_kb;
|
||||
|
||||
/*
|
||||
* Copied from NFS
|
||||
*
|
||||
* congestion size, scale with available memory.
|
||||
*
|
||||
* 64MB: 8192k
|
||||
* 128MB: 11585k
|
||||
* 256MB: 16384k
|
||||
* 512MB: 23170k
|
||||
* 1GB: 32768k
|
||||
* 2GB: 46340k
|
||||
* 4GB: 65536k
|
||||
* 8GB: 92681k
|
||||
* 16GB: 131072k
|
||||
*
|
||||
* This allows larger machines to have larger/more transfers.
|
||||
* Limit the default to 256M
|
||||
*/
|
||||
congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
|
||||
if (congestion_kb > 256*1024)
|
||||
congestion_kb = 256*1024;
|
||||
|
||||
return congestion_kb;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* snap.c */
|
||||
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
|
||||
u64 ino);
|
||||
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
|
||||
struct ceph_snap_realm *realm);
|
||||
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
|
||||
struct ceph_snap_realm *realm);
|
||||
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
|
||||
void *p, void *e, bool deletion);
|
||||
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session,
|
||||
struct ceph_msg *msg);
|
||||
extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
|
||||
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
|
||||
struct ceph_cap_snap *capsnap);
|
||||
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
|
||||
|
||||
/*
|
||||
* a cap_snap is "pending" if it is still awaiting an in-progress
|
||||
* sync write (that may/may not still update size, mtime, etc.).
|
||||
*/
|
||||
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{
	/* the most recent cap_snap sits at the tail of i_cap_snaps */
	return !list_empty(&ci->i_cap_snaps) &&
		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
			   ci_item)->writing;
}
|
||||
|
||||
/* inode.c */
|
||||
extern const struct inode_operations ceph_file_iops;
|
||||
|
||||
extern struct inode *ceph_alloc_inode(struct super_block *sb);
|
||||
extern void ceph_destroy_inode(struct inode *inode);
|
||||
extern int ceph_drop_inode(struct inode *inode);
|
||||
|
||||
extern struct inode *ceph_get_inode(struct super_block *sb,
|
||||
struct ceph_vino vino);
|
||||
extern struct inode *ceph_get_snapdir(struct inode *parent);
|
||||
extern int ceph_fill_file_size(struct inode *inode, int issued,
|
||||
u32 truncate_seq, u64 truncate_size, u64 size);
|
||||
extern void ceph_fill_file_time(struct inode *inode, int issued,
|
||||
u64 time_warp_seq, struct timespec *ctime,
|
||||
struct timespec *mtime, struct timespec *atime);
|
||||
extern int ceph_fill_trace(struct super_block *sb,
|
||||
struct ceph_mds_request *req,
|
||||
struct ceph_mds_session *session);
|
||||
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
|
||||
struct ceph_mds_session *session);
|
||||
|
||||
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
|
||||
|
||||
extern int ceph_inode_set_size(struct inode *inode, loff_t size);
|
||||
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
|
||||
extern void ceph_queue_vmtruncate(struct inode *inode);
|
||||
|
||||
extern void ceph_queue_invalidate(struct inode *inode);
|
||||
extern void ceph_queue_writeback(struct inode *inode);
|
||||
|
||||
extern int ceph_do_getattr(struct inode *inode, int mask, bool force);
|
||||
extern int ceph_permission(struct inode *inode, int mask);
|
||||
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
|
||||
/* xattr.c */
|
||||
extern int ceph_setxattr(struct dentry *, const char *, const void *,
|
||||
size_t, int);
|
||||
int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
|
||||
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
|
||||
int __ceph_removexattr(struct dentry *, const char *);
|
||||
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
|
||||
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
|
||||
extern int ceph_removexattr(struct dentry *, const char *);
|
||||
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
|
||||
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
|
||||
extern void __init ceph_xattr_init(void);
|
||||
extern void ceph_xattr_exit(void);
|
||||
extern const struct xattr_handler *ceph_xattr_handlers[];
|
||||
|
||||
/* acl.c */
|
||||
/*
 * ACL state prepared before inode creation; filled in by
 * ceph_pre_init_acls() and applied/released via
 * ceph_init_inode_acls()/ceph_release_acls_info().
 */
struct ceph_acls_info {
	void *default_acl;
	void *acl;
	struct ceph_pagelist *pagelist;	/* encoded acls -- TODO confirm use */
};
|
||||
|
||||
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||
|
||||
struct posix_acl *ceph_get_acl(struct inode *, int);
|
||||
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
|
||||
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
|
||||
struct ceph_acls_info *info);
|
||||
void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
|
||||
void ceph_release_acls_info(struct ceph_acls_info *info);
|
||||
|
||||
/* Drop any ACLs cached on the inode (wraps the VFS helper). */
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
	forget_all_cached_acls(inode);
}
|
||||
|
||||
#else

/* CONFIG_CEPH_FS_POSIX_ACL disabled: stub the whole ACL interface out. */

#define ceph_get_acl NULL
#define ceph_set_acl NULL

static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
				     struct ceph_acls_info *info)
{
	return 0;
}
static inline void ceph_init_inode_acls(struct inode *inode,
					struct ceph_acls_info *info)
{
}
static inline void ceph_release_acls_info(struct ceph_acls_info *info)
{
}
static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
{
	return 0;
}

static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
}

#endif
|
||||
|
||||
/* caps.c */
|
||||
extern const char *ceph_cap_string(int c);
|
||||
extern void ceph_handle_caps(struct ceph_mds_session *session,
|
||||
struct ceph_msg *msg);
|
||||
extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
|
||||
struct ceph_cap_reservation *ctx);
|
||||
extern void ceph_add_cap(struct inode *inode,
|
||||
struct ceph_mds_session *session, u64 cap_id,
|
||||
int fmode, unsigned issued, unsigned wanted,
|
||||
unsigned cap, unsigned seq, u64 realmino, int flags,
|
||||
struct ceph_cap **new_cap);
|
||||
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
|
||||
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
|
||||
struct ceph_cap *cap);
|
||||
extern int ceph_is_any_caps(struct inode *inode);
|
||||
|
||||
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
|
||||
u64 cap_id, u32 migrate_seq, u32 issue_seq);
|
||||
extern void ceph_queue_caps_release(struct inode *inode);
|
||||
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
|
||||
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync);
|
||||
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
|
||||
int mds);
|
||||
extern int ceph_get_cap_mds(struct inode *inode);
|
||||
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
|
||||
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
|
||||
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
|
||||
struct ceph_snap_context *snapc);
|
||||
extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
|
||||
struct ceph_mds_session **psession,
|
||||
int again);
|
||||
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
||||
struct ceph_mds_session *session);
|
||||
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
|
||||
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
|
||||
|
||||
extern int ceph_encode_inode_release(void **p, struct inode *inode,
|
||||
int mds, int drop, int unless, int force);
|
||||
extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
|
||||
int mds, int drop, int unless);
|
||||
|
||||
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
|
||||
int *got, loff_t endoff);
|
||||
|
||||
/* for counting open files by mode */
|
||||
/* Count one more open file of @mode (pairs with ceph_put_fmode()). */
static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
{
	ci->i_nr_by_mode[mode]++;
}
|
||||
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
|
||||
|
||||
/* addr.c */
|
||||
extern const struct address_space_operations ceph_aops;
|
||||
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
|
||||
|
||||
/* file.c */
|
||||
extern const struct file_operations ceph_file_fops;
|
||||
extern const struct address_space_operations ceph_aops;
|
||||
|
||||
extern int ceph_open(struct inode *inode, struct file *file);
|
||||
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
|
||||
struct file *file, unsigned flags, umode_t mode,
|
||||
int *opened);
|
||||
extern int ceph_release(struct inode *inode, struct file *filp);
|
||||
|
||||
/* dir.c */
|
||||
extern const struct file_operations ceph_dir_fops;
|
||||
extern const struct inode_operations ceph_dir_iops;
|
||||
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
|
||||
ceph_snapdir_dentry_ops;
|
||||
|
||||
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
|
||||
extern int ceph_handle_snapdir(struct ceph_mds_request *req,
|
||||
struct dentry *dentry, int err);
|
||||
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
|
||||
struct dentry *dentry, int err);
|
||||
|
||||
extern void ceph_dentry_lru_add(struct dentry *dn);
|
||||
extern void ceph_dentry_lru_touch(struct dentry *dn);
|
||||
extern void ceph_dentry_lru_del(struct dentry *dn);
|
||||
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
|
||||
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
|
||||
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
|
||||
|
||||
/*
|
||||
* our d_ops vary depending on whether the inode is live,
|
||||
* snapshotted (read-only), or a virtual ".snap" directory.
|
||||
*/
|
||||
int ceph_init_dentry(struct dentry *dentry);
|
||||
|
||||
|
||||
/* ioctl.c */
|
||||
extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
|
||||
|
||||
/* export.c */
|
||||
extern const struct export_operations ceph_export_ops;
|
||||
|
||||
/* locks.c */
|
||||
extern __init void ceph_flock_init(void);
|
||||
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
|
||||
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
|
||||
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
|
||||
extern int ceph_encode_locks_to_buffer(struct inode *inode,
|
||||
struct ceph_filelock *flocks,
|
||||
int num_fcntl_locks,
|
||||
int num_flock_locks);
|
||||
extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
|
||||
struct ceph_pagelist *pagelist,
|
||||
int num_fcntl_locks, int num_flock_locks);
|
||||
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
|
||||
|
||||
/* debugfs.c */
|
||||
extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
|
||||
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
|
||||
|
||||
#endif /* _FS_CEPH_SUPER_H */
|
1105
fs/ceph/xattr.c
Normal file
1105
fs/ceph/xattr.c
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue