Fixed MTP to work with TWRP

commit f6dfaef42e by awab228, 2018-06-19 23:16:04 +02:00
50820 changed files with 20846062 additions and 0 deletions

fs/ceph/Kconfig (new file, 40 lines)

@@ -0,0 +1,40 @@
config CEPH_FS
	tristate "Ceph distributed file system"
	depends on INET
	select CEPH_LIB
	select LIBCRC32C
	select CRYPTO_AES
	select CRYPTO
	default n
	help
	  Choose Y or M here to include support for mounting the
	  experimental Ceph distributed file system.  Ceph is an extremely
	  scalable file system designed to provide high performance,
	  reliable access to petabytes of storage.

	  More information at http://ceph.newdream.net/.

	  If unsure, say N.

if CEPH_FS

config CEPH_FSCACHE
	bool "Enable Ceph client caching support"
	depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
	help
	  Choose Y here to enable persistent, read-only local
	  caching support for Ceph clients using FS-Cache

endif

config CEPH_FS_POSIX_ACL
	bool "Ceph POSIX Access Control Lists"
	depends on CEPH_FS
	select FS_POSIX_ACL
	help
	  POSIX Access Control Lists (ACLs) support permissions for users and
	  groups beyond the owner/group/world scheme.

	  To learn more about Access Control Lists, visit the POSIX ACLs for
	  Linux website <http://acl.bestbits.at/>.

	  If you don't know what Access Control Lists are, say N

fs/ceph/Makefile (new file, 13 lines)

@@ -0,0 +1,13 @@
#
# Makefile for CEPH filesystem.
#

obj-$(CONFIG_CEPH_FS) += ceph.o

ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
	export.o caps.o snap.o xattr.o \
	mds_client.o mdsmap.o strings.o ceph_frag.o \
	debugfs.o

ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o

fs/ceph/acl.c (new file, 277 lines)

@@ -0,0 +1,277 @@
/*
* linux/fs/ceph/acl.c
*
* Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include "super.h"
static inline void ceph_set_cached_acl(struct inode *inode,
int type, struct posix_acl *acl)
{
struct ceph_inode_info *ci = ceph_inode(inode);
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
set_cached_acl(inode, type, acl);
spin_unlock(&ci->i_ceph_lock);
}
static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
int type)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct posix_acl *acl = ACL_NOT_CACHED;
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
acl = get_cached_acl(inode, type);
spin_unlock(&ci->i_ceph_lock);
return acl;
}
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
{
int size;
const char *name;
char *value = NULL;
struct posix_acl *acl;
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
break;
case ACL_TYPE_DEFAULT:
name = POSIX_ACL_XATTR_DEFAULT;
break;
default:
BUG();
}
size = __ceph_getxattr(inode, name, "", 0);
if (size > 0) {
value = kzalloc(size, GFP_NOFS);
if (!value)
return ERR_PTR(-ENOMEM);
size = __ceph_getxattr(inode, name, value, size);
}
if (size > 0)
acl = posix_acl_from_xattr(&init_user_ns, value, size);
else if (size == -ERANGE || size == -ENODATA || size == 0)
acl = NULL;
else
acl = ERR_PTR(-EIO);
kfree(value);
if (!IS_ERR(acl))
ceph_set_cached_acl(inode, type, acl);
return acl;
}
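/*
 * Note: the two __ceph_getxattr() calls above are the standard xattr
 * size-probe idiom: ask once with an empty buffer to learn the value
 * length, allocate, then fetch for real. The userspace equivalent with
 * getxattr(2) looks like this (path and attribute name illustrative):
 *
 *	ssize_t size = getxattr("/mnt/ceph/file",
 *				"system.posix_acl_access", NULL, 0);
 *	char *value = malloc(size);		// then check for NULL
 *	size = getxattr("/mnt/ceph/file",
 *			"system.posix_acl_access", value, size);
 */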
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int ret = 0, size = 0;
const char *name = NULL;
char *value = NULL;
struct iattr newattrs;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
struct dentry *dentry;
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &new_mode);
if (ret < 0)
goto out;
if (ret == 0)
acl = NULL;
}
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode)) {
ret = acl ? -EINVAL : 0;
goto out;
}
name = POSIX_ACL_XATTR_DEFAULT;
break;
default:
ret = -EINVAL;
goto out;
}
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_NOFS);
if (!value) {
ret = -ENOMEM;
goto out;
}
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
goto out_free;
}
dentry = d_find_alias(inode);
if (new_mode != old_mode) {
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
ret = ceph_setattr(dentry, &newattrs);
if (ret)
goto out_dput;
}
ret = __ceph_setxattr(dentry, name, value, size, 0);
if (ret) {
if (new_mode != old_mode) {
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE;
ceph_setattr(dentry, &newattrs);
}
goto out_dput;
}
ceph_set_cached_acl(inode, type, acl);
out_dput:
dput(dentry);
out_free:
kfree(value);
out:
return ret;
}
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acls_info *info)
{
struct posix_acl *acl, *default_acl;
size_t val_size1 = 0, val_size2 = 0;
struct ceph_pagelist *pagelist = NULL;
void *tmp_buf = NULL;
int err;
err = posix_acl_create(dir, mode, &default_acl, &acl);
if (err)
return err;
if (acl) {
int ret = posix_acl_equiv_mode(acl, mode);
if (ret < 0)
goto out_err;
if (ret == 0) {
posix_acl_release(acl);
acl = NULL;
}
}
if (!default_acl && !acl)
return 0;
if (acl)
val_size1 = posix_acl_xattr_size(acl->a_count);
if (default_acl)
val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM;
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
if (!tmp_buf)
goto out_err;
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
if (!pagelist)
goto out_err;
ceph_pagelist_init(pagelist);
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
if (err)
goto out_err;
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
if (acl) {
size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
len);
err = posix_acl_to_xattr(&init_user_ns, acl,
tmp_buf, val_size1);
if (err < 0)
goto out_err;
ceph_pagelist_encode_32(pagelist, val_size1);
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
if (default_acl) {
size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
err = ceph_pagelist_encode_string(pagelist,
POSIX_ACL_XATTR_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
goto out_err;
ceph_pagelist_encode_32(pagelist, val_size2);
ceph_pagelist_append(pagelist, tmp_buf, val_size2);
}
kfree(tmp_buf);
info->acl = acl;
info->default_acl = default_acl;
info->pagelist = pagelist;
return 0;
out_err:
posix_acl_release(acl);
posix_acl_release(default_acl);
kfree(tmp_buf);
if (pagelist)
ceph_pagelist_release(pagelist);
return err;
}
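/*
 * For reference, the pagelist built by ceph_pre_init_acls() above is
 * the xattr blob attached to the create request. A sketch of its
 * layout (all integers little-endian, per the encode helpers above):
 *
 *	__le32 count;		// 2 when both ACLs are present
 *	__le32 name_len;	// strlen("system.posix_acl_access")
 *	char   name[name_len];	// xattr name, not NUL-terminated
 *	__le32 val_len;		// posix_acl_xattr_size(acl->a_count)
 *	u8     value[val_len];	// posix_acl_to_xattr() output
 *	...			// second (name, value) pair for the
 *				// "system.posix_acl_default" xattr
 */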
void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
{
if (!inode)
return;
ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
}
void ceph_release_acls_info(struct ceph_acls_info *info)
{
posix_acl_release(info->acl);
posix_acl_release(info->default_acl);
if (info->pagelist)
ceph_pagelist_release(info->pagelist);
}

fs/ceph/addr.c (new file, 1333 lines)

File diff suppressed because it is too large.

fs/ceph/cache.c (new file, 402 lines)

@@ -0,0 +1,402 @@
/*
* Ceph cache definitions.
*
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
* Written by Milosz Tanski (milosz@adfin.com)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to:
* Free Software Foundation
* 51 Franklin Street, Fifth Floor
* Boston, MA 02111-1301 USA
*
*/
#include "super.h"
#include "cache.h"
struct ceph_aux_inode {
struct timespec mtime;
loff_t size;
};
struct fscache_netfs ceph_cache_netfs = {
.name = "ceph",
.version = 0,
};
static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
void *buffer, uint16_t maxbuf)
{
const struct ceph_fs_client* fsc = cookie_netfs_data;
uint16_t klen;
klen = sizeof(fsc->client->fsid);
if (klen > maxbuf)
return 0;
memcpy(buffer, &fsc->client->fsid, klen);
return klen;
}
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
.name = "CEPH.fsid",
.type = FSCACHE_COOKIE_TYPE_INDEX,
.get_key = ceph_fscache_session_get_key,
};
int ceph_fscache_register(void)
{
return fscache_register_netfs(&ceph_cache_netfs);
}
void ceph_fscache_unregister(void)
{
fscache_unregister_netfs(&ceph_cache_netfs);
}
int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
fsc, true);
if (fsc->fscache == NULL) {
pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
return 0;
}
fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
if (fsc->revalidate_wq == NULL)
return -ENOMEM;
return 0;
}
static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
void *buffer, uint16_t maxbuf)
{
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
/* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
memcpy(buffer, &ci->i_vino, klen);
return klen;
}
static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
void *buffer, uint16_t bufmax)
{
struct ceph_aux_inode aux;
const struct ceph_inode_info* ci = cookie_netfs_data;
const struct inode* inode = &ci->vfs_inode;
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
aux.size = inode->i_size;
memcpy(buffer, &aux, sizeof(aux));
return sizeof(aux);
}
static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
uint64_t *size)
{
const struct ceph_inode_info* ci = cookie_netfs_data;
const struct inode* inode = &ci->vfs_inode;
*size = inode->i_size;
}
static enum fscache_checkaux ceph_fscache_inode_check_aux(
void *cookie_netfs_data, const void *data, uint16_t dlen)
{
struct ceph_aux_inode aux;
struct ceph_inode_info* ci = cookie_netfs_data;
struct inode* inode = &ci->vfs_inode;
if (dlen != sizeof(aux))
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
aux.size = inode->i_size;
if (memcmp(data, &aux, sizeof(aux)) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
dout("ceph inode 0x%p cached okay", ci);
return FSCACHE_CHECKAUX_OKAY;
}
static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
{
struct ceph_inode_info* ci = cookie_netfs_data;
struct pagevec pvec;
pgoff_t first;
int loop, nr_pages;
pagevec_init(&pvec, 0);
first = 0;
dout("ceph inode 0x%p now uncached", ci);
while (1) {
nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
PAGEVEC_SIZE - pagevec_count(&pvec));
if (!nr_pages)
break;
for (loop = 0; loop < nr_pages; loop++)
ClearPageFsCache(pvec.pages[loop]);
first = pvec.pages[nr_pages - 1]->index + 1;
pvec.nr = nr_pages;
pagevec_release(&pvec);
cond_resched();
}
}
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
.get_key = ceph_fscache_inode_get_key,
.get_attr = ceph_fscache_inode_get_attr,
.get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
.now_uncached = ceph_fscache_inode_now_uncached,
};
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
struct ceph_inode_info* ci)
{
struct inode* inode = &ci->vfs_inode;
/* No caching for filesystem */
if (fsc->fscache == NULL)
return;
/* Only cache for regular files that are read only */
if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
return;
/* Avoid multiple racing open requests */
mutex_lock(&inode->i_mutex);
if (ci->fscache)
goto done;
ci->fscache = fscache_acquire_cookie(fsc->fscache,
&ceph_fscache_inode_object_def,
ci, true);
fscache_check_consistency(ci->fscache);
done:
mutex_unlock(&inode->i_mutex);
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
struct fscache_cookie* cookie;
if ((cookie = ci->fscache) == NULL)
return;
ci->fscache = NULL;
fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
fscache_relinquish_cookie(cookie, 0);
}
static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
{
if (!error)
SetPageUptodate(page);
}
static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
{
if (!error)
SetPageUptodate(page);
unlock_page(page);
}
static inline int cache_valid(struct ceph_inode_info *ci)
{
return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
(ci->i_fscache_gen == ci->i_rdcache_gen));
}
/* Attempt to read from the fscache,
*
* This function is called from the readpage_nounlock context. DO NOT attempt to
* unlock the page here (or in the callback).
*/
int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
if (!cache_valid(ci))
return -ENOBUFS;
ret = fscache_read_or_alloc_page(ci->fscache, page,
ceph_vfs_readpage_complete, NULL,
GFP_KERNEL);
switch (ret) {
case 0: /* Page found */
dout("page read submitted\n");
return 0;
case -ENOBUFS: /* Pages were not found, and can't be */
case -ENODATA: /* Pages were not found */
dout("page/inode not in cache\n");
return ret;
default:
dout("%s: unknown error ret = %i\n", __func__, ret);
return ret;
}
}
int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
if (!cache_valid(ci))
return -ENOBUFS;
ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
ceph_vfs_readpage_complete_unlock,
NULL, mapping_gfp_mask(mapping));
switch (ret) {
case 0: /* All pages found */
dout("all-page read submitted\n");
return 0;
case -ENOBUFS: /* Some pages were not found, and can't be */
case -ENODATA: /* some pages were not found */
dout("page/inode not in cache\n");
return ret;
default:
dout("%s: unknown error ret = %i\n", __func__, ret);
return ret;
}
}
void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
if (!PageFsCache(page))
return;
if (!cache_valid(ci))
return;
ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
if (ret)
fscache_uncache_page(ci->fscache, page);
}
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
if (!PageFsCache(page))
return;
fscache_wait_on_page_write(ci->fscache, page);
fscache_uncache_page(ci->fscache, page);
}
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
if (fsc->revalidate_wq)
destroy_workqueue(fsc->revalidate_wq);
fscache_relinquish_cookie(fsc->fscache, 0);
fsc->fscache = NULL;
}
static void ceph_revalidate_work(struct work_struct *work)
{
int issued;
u32 orig_gen;
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_revalidate_work);
struct inode *inode = &ci->vfs_inode;
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
if (!(issued & CEPH_CAP_FILE_CACHE)) {
dout("revalidate_work lost cache before validation %p\n",
inode);
goto out;
}
if (!fscache_check_consistency(ci->fscache))
fscache_invalidate(ci->fscache);
spin_lock(&ci->i_ceph_lock);
/* Update the new valid generation (backwards sanity check too) */
if (orig_gen > ci->i_fscache_gen) {
ci->i_fscache_gen = orig_gen;
}
spin_unlock(&ci->i_ceph_lock);
out:
iput(&ci->vfs_inode);
}
void ceph_queue_revalidate(struct inode *inode)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
return;
ihold(inode);
if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
&ci->i_revalidate_work)) {
dout("ceph_queue_revalidate %p\n", inode);
} else {
dout("ceph_queue_revalidate %p failed\n)", inode);
iput(inode);
}
}
void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
ci->fscache = NULL;
/* The first load is verified at cookie open time */
ci->i_fscache_gen = 1;
INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
}

fs/ceph/cache.h (new file, 182 lines)

@@ -0,0 +1,182 @@
/*
* Ceph cache definitions.
*
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
* Written by Milosz Tanski (milosz@adfin.com)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to:
* Free Software Foundation
* 51 Franklin Street, Fifth Floor
* Boston, MA 02111-1301 USA
*
*/
#ifndef _CEPH_CACHE_H
#define _CEPH_CACHE_H
#ifdef CONFIG_CEPH_FSCACHE
extern struct fscache_netfs ceph_cache_netfs;
int ceph_fscache_register(void);
void ceph_fscache_unregister(void);
int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
void ceph_fscache_inode_init(struct ceph_inode_info *ci);
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
struct ceph_inode_info* ci);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages);
void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);
static inline void ceph_fscache_update_objectsize(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
fscache_attr_changed(ci->fscache);
}
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
}
static inline void ceph_fscache_uncache_page(struct inode *inode,
struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
return fscache_uncache_page(ci->fscache, page);
}
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
struct inode* inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
return fscache_maybe_release_page(ci->fscache, page, gfp);
}
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
__fscache_uncache_page(ci->fscache, page);
}
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
struct ceph_inode_info *ci = ceph_inode(inode);
return fscache_readpages_cancel(ci->fscache, pages);
}
#else
static inline int ceph_fscache_register(void)
{
return 0;
}
static inline void ceph_fscache_unregister(void)
{
}
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
return 0;
}
static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
}
static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
struct ceph_inode_info* ci)
{
}
static inline void ceph_fscache_uncache_page(struct inode *inode,
struct page *pages)
{
}
static inline int ceph_readpage_from_fscache(struct inode* inode,
struct page *page)
{
return -ENOBUFS;
}
static inline int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages)
{
return -ENOBUFS;
}
static inline void ceph_readpage_to_fscache(struct inode *inode,
struct page *page)
{
}
static inline void ceph_fscache_update_objectsize(struct inode *inode)
{
}
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
static inline void ceph_invalidate_fscache_page(struct inode *inode,
struct page *page)
{
}
static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
}
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
return 1;
}
static inline void ceph_fscache_readpage_cancel(struct inode *inode,
struct page *page)
{
}
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
}
static inline void ceph_queue_revalidate(struct inode *inode)
{
}
#endif
#endif
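The disabled half of this header is the usual kernel stub pattern: with CONFIG_CEPH_FSCACHE off, every entry point collapses to an empty static inline, so call sites compile unchanged and need no #ifdefs of their own. A generic sketch of the idiom, with a hypothetical feature name:

#ifdef CONFIG_MY_FEATURE
int my_feature_init(void);	/* real implementation lives in a .c file */
#else
static inline int my_feature_init(void)
{
	return 0;		/* feature disabled: succeed as a no-op */
}
#endif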

fs/ceph/caps.c (new file, 3332 lines)

File diff suppressed because it is too large.

fs/ceph/ceph_frag.c (new file, 22 lines)

@@ -0,0 +1,22 @@
/*
* Ceph 'frag' type
*/
#include <linux/module.h>
#include <linux/ceph/types.h>
int ceph_frag_compare(__u32 a, __u32 b)
{
unsigned va = ceph_frag_value(a);
unsigned vb = ceph_frag_value(b);
if (va < vb)
return -1;
if (va > vb)
return 1;
va = ceph_frag_bits(a);
vb = ceph_frag_bits(b);
if (va < vb)
return -1;
if (va > vb)
return 1;
return 0;
}
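The comparator orders fragments by their value first and their mask width second, giving a total order suitable for tree keys. A small usage sketch, assuming ceph_frag_make(bits, value) from the shared ceph headers; the constants are illustrative only:

static void frag_compare_example(void)
{
	__u32 left = ceph_frag_make(1, 0);		/* low half of a dir */
	__u32 right = ceph_frag_make(1, 0x800000);	/* high half */

	if (ceph_frag_compare(left, right) < 0)
		pr_debug("left frag sorts before right frag\n");
}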

fs/ceph/debugfs.c (new file, 323 lines)

@@ -0,0 +1,323 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
#include "super.h"
#ifdef CONFIG_DEBUG_FS
#include "mds_client.h"
static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_fs_client *fsc = s->private;
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
seq_printf(s, "session_timeout %d\n",
fsc->mdsc->mdsmap->m_session_timeout);
seq_printf(s, "session_autoclose %d\n",
fsc->mdsc->mdsmap->m_session_autoclose);
for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
struct ceph_entity_addr *addr =
&fsc->mdsc->mdsmap->m_info[i].addr;
int state = fsc->mdsc->mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
}
return 0;
}
/*
* mdsc debugfs
*/
static int mdsc_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
int pathlen;
u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
req = rb_entry(rp, struct ceph_mds_request, r_node);
if (req->r_request && req->r_session)
seq_printf(s, "%lld\tmds%d\t", req->r_tid,
req->r_session->s_mds);
else if (!req->r_request)
seq_printf(s, "%lld\t(no request)\t", req->r_tid);
else
seq_printf(s, "%lld\t(no session)\t", req->r_tid);
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
if (req->r_got_unsafe)
seq_puts(s, "\t(unsafe)");
else
seq_puts(s, "\t");
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
ceph_ino(req->r_dentry->d_parent->d_inode),
req->r_dentry->d_name.len,
req->r_dentry->d_name.name,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
kfree(path);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
} else {
seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
req->r_old_dentry_dir ?
ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
kfree(path);
} else if (req->r_path2) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
req->r_path2);
else
seq_printf(s, " %s", req->r_path2);
}
seq_puts(s, "\n");
}
mutex_unlock(&mdsc->mutex);
return 0;
}
static int caps_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
int total, avail, used, reserved, min;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
"avail\t\t%d\n"
"used\t\t%d\n"
"reserved\t%d\n"
"min\t%d\n",
total, avail, used, reserved, min);
return 0;
}
static int dentry_lru_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_dentry_info *di;
spin_lock(&mdsc->dentry_lru_lock);
list_for_each_entry(di, &mdsc->dentry_lru, lru) {
struct dentry *dentry = di->dentry;
seq_printf(s, "%p %p\t%.*s\n",
di, dentry, dentry->d_name.len, dentry->d_name.name);
}
spin_unlock(&mdsc->dentry_lru_lock);
return 0;
}
static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_auth_client *ac = fsc->client->monc.auth;
struct ceph_options *opt = fsc->client->options;
int mds = -1;
mutex_lock(&mdsc->mutex);
/* The 'num' portion of an 'entity name' */
seq_printf(s, "global_id %llu\n", ac->global_id);
/* The -o name mount argument */
seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
/* The list of MDS session rank+state */
for (mds = 0; mds < mdsc->max_sessions; mds++) {
struct ceph_mds_session *session =
__ceph_lookup_mds_session(mdsc, mds);
if (!session) {
continue;
}
mutex_unlock(&mdsc->mutex);
seq_printf(s, "mds.%d %s\n",
session->s_mds,
ceph_session_state_name(session->s_state));
ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
return 0;
}
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
/*
* debugfs
*/
static int congestion_kb_set(void *data, u64 val)
{
struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
fsc->mount_options->congestion_kb = (int)val;
return 0;
}
static int congestion_kb_get(void *data, u64 *val)
{
struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
*val = (u64)fsc->mount_options->congestion_kb;
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
congestion_kb_set, "%llu\n");
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
dout("ceph_fs_debugfs_cleanup\n");
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove(fsc->debugfs_dentry_lru);
}
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
char name[100];
int err = -ENOMEM;
dout("ceph_fs_debugfs_init\n");
BUG_ON(!fsc->client->debugfs_dir);
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
fsc->client->debugfs_dir,
fsc,
&congestion_kb_fops);
if (!fsc->debugfs_congestion_kb)
goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
dev_name(fsc->backing_dev_info.dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
name);
if (!fsc->debugfs_bdi)
goto out;
fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
0600,
fsc->client->debugfs_dir,
fsc,
&mdsmap_show_fops);
if (!fsc->debugfs_mdsmap)
goto out;
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0600,
fsc->client->debugfs_dir,
fsc,
&mds_sessions_show_fops);
if (!fsc->debugfs_mds_sessions)
goto out;
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0600,
fsc->client->debugfs_dir,
fsc,
&mdsc_show_fops);
if (!fsc->debugfs_mdsc)
goto out;
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
fsc,
&caps_show_fops);
if (!fsc->debugfs_caps)
goto out;
fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
0600,
fsc->client->debugfs_dir,
fsc,
&dentry_lru_show_fops);
if (!fsc->debugfs_dentry_lru)
goto out;
return 0;
out:
ceph_fs_debugfs_cleanup(fsc);
return err;
}
#else /* CONFIG_DEBUG_FS */
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
return 0;
}
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
}
#endif /* CONFIG_DEBUG_FS */
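With debugfs mounted, the files registered above appear under the client's entry in /sys/kernel/debug/ceph/. A hedged userspace sketch that dumps one of them; CLIENT_DIR stands in for the per-client fsid.client-id directory name:

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/kernel/debug/ceph/CLIENT_DIR/mdsmap", "r");

	if (!f)
		return 1;
	while (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);
	fclose(f);
	return 0;
}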

fs/ceph/dir.c (new file, 1369 lines)

File diff suppressed because it is too large.

fs/ceph/export.c (new file, 250 lines)

@@ -0,0 +1,250 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
#include "super.h"
#include "mds_client.h"
/*
* Basic fh
*/
struct ceph_nfs_fh {
u64 ino;
} __attribute__ ((packed));
/*
* Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
} __attribute__ ((packed));
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
int type;
struct ceph_nfs_fh *fh = (void *)rawfh;
struct ceph_nfs_confh *cfh = (void *)rawfh;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
if (parent_inode && (*max_len < connected_handle_length)) {
*max_len = connected_handle_length;
return FILEID_INVALID;
} else if (*max_len < handle_length) {
*max_len = handle_length;
return FILEID_INVALID;
}
if (parent_inode) {
dout("encode_fh %llx with parent %llx\n",
ceph_ino(inode), ceph_ino(parent_inode));
cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
type = FILEID_INO32_GEN_PARENT;
} else {
dout("encode_fh %llx\n", ceph_ino(inode));
fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = FILEID_INO32_GEN;
}
return type;
}
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
int err;
vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
req->r_ino1 = vino;
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
inode = req->r_target_inode;
if (inode)
ihold(inode);
ceph_mdsc_put_request(req);
if (!inode)
return ERR_PTR(-ESTALE);
}
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
dput(dentry);
return ERR_PTR(err);
}
dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
* convert regular fh to dentry
*/
static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_fh *fh = (void *)fid->raw;
if (fh_type != FILEID_INO32_GEN &&
fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
if (fh_len < sizeof(*fh) / 4)
return NULL;
dout("fh_to_dentry %llx\n", fh->ino);
return __fh_to_dentry(sb, fh->ino);
}
static struct dentry *__get_parent(struct super_block *sb,
struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
int err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
if (child) {
req->r_inode = child->d_inode;
ihold(child->d_inode);
} else {
req->r_ino1 = (struct ceph_vino) {
.ino = ino,
.snap = CEPH_NOSNAP,
};
}
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
inode = req->r_target_inode;
if (inode)
ihold(inode);
ceph_mdsc_put_request(req);
if (!inode)
return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
dput(dentry);
return ERR_PTR(err);
}
dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
child ? ceph_ino(child->d_inode) : ino,
dentry, ceph_vinop(inode));
return dentry;
}
static struct dentry *ceph_get_parent(struct dentry *child)
{
/* don't re-export snaps */
if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
return ERR_PTR(-EINVAL);
dout("get_parent %p ino %llx.%llx\n",
child, ceph_vinop(child->d_inode));
return __get_parent(child->d_sb, child, 0);
}
/*
* convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
struct dentry *dentry;
if (fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
if (fh_len < sizeof(*cfh) / 4)
return NULL;
dout("fh_to_parent %llx\n", cfh->parent_ino);
dentry = __get_parent(sb, NULL, cfh->ino);
if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
dentry = __fh_to_dentry(sb, cfh->parent_ino);
return dentry;
}
static int ceph_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
struct ceph_mds_client *mdsc;
struct ceph_mds_request *req;
int err;
mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
USE_ANY_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
mutex_lock(&parent->d_inode->i_mutex);
req->r_inode = child->d_inode;
ihold(child->d_inode);
req->r_ino2 = ceph_vino(parent->d_inode);
req->r_locked_dir = parent->d_inode;
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
mutex_unlock(&parent->d_inode->i_mutex);
if (!err) {
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
memcpy(name, rinfo->dname, rinfo->dname_len);
name[rinfo->dname_len] = 0;
dout("get_name %p ino %llx.%llx name %s\n",
child, ceph_vinop(child->d_inode), name);
} else {
dout("get_name %p ino %llx.%llx err %d\n",
child, ceph_vinop(child->d_inode), err);
}
ceph_mdsc_put_request(req);
return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
.get_parent = ceph_get_parent,
.get_name = ceph_get_name,
};
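These export_operations back both knfsd re-exports and the open-by-handle syscalls. A hedged userspace round trip using name_to_handle_at(2) and open_by_handle_at(2); the paths are placeholders, and the reopen requires CAP_DAC_READ_SEARCH:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, mount_fd, fd;

	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;
	/* encoding lands in ceph_encode_fh() above */
	if (name_to_handle_at(AT_FDCWD, "/mnt/ceph/file", fh, &mount_id, 0))
		return 1;
	mount_fd = open("/mnt/ceph", O_RDONLY);
	/* decoding lands in ceph_fh_to_dentry() above */
	fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
	if (fd >= 0)
		printf("reopened the inode from its file handle\n");
	free(fh);
	return 0;
}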

fs/ceph/file.c (new file, 1273 lines)

File diff suppressed because it is too large.

fs/ceph/inode.c (new file, 1982 lines)

File diff suppressed because it is too large.

fs/ceph/ioctl.c (new file, 295 lines)

@@ -0,0 +1,295 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
#include "ioctl.h"
/*
* ioctls
*/
/*
* get and set the file layout
*/
static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
{
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
struct ceph_ioctl_layout l;
int err;
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
l.stripe_unit = ceph_file_layout_su(ci->i_layout);
l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
l.object_size = ceph_file_layout_object_size(ci->i_layout);
l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
}
return err;
}
static long __validate_layout(struct ceph_mds_client *mdsc,
struct ceph_ioctl_layout *l)
{
int i, err;
/* validate striping parameters */
if ((l->object_size & ~PAGE_MASK) ||
(l->stripe_unit & ~PAGE_MASK) ||
((unsigned)l->stripe_unit != 0 &&
((unsigned)l->object_size % (unsigned)l->stripe_unit)))
return -EINVAL;
/* make sure it's a valid data pool */
mutex_lock(&mdsc->mutex);
err = -EINVAL;
for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
err = 0;
break;
}
mutex_unlock(&mdsc->mutex);
if (err)
return err;
return 0;
}
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
struct ceph_ioctl_layout nl;
int err;
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
/* validate changed params against current layout */
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (err)
return err;
memset(&nl, 0, sizeof(nl));
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
if (l.object_size)
nl.object_size = l.object_size;
else
nl.object_size = ceph_file_layout_object_size(ci->i_layout);
if (l.data_pool)
nl.data_pool = l.data_pool;
else
nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
/* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1);
err = __validate_layout(mdsc, &nl);
if (err)
return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
req->r_args.setlayout.layout.fl_stripe_count =
cpu_to_le32(l.stripe_count);
req->r_args.setlayout.layout.fl_object_size =
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
/*
* Set a layout policy on a directory inode. All items in the tree
* rooted at this inode will inherit this layout on creation
* (it doesn't apply retroactively), unless a subdirectory has its
* own layout policy.
*/
static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
err = __validate_layout(mdsc, &l);
if (err)
return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
req->r_args.setlayout.layout.fl_stripe_count =
cpu_to_le32(l.stripe_count);
req->r_args.setlayout.layout.fl_object_size =
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool =
cpu_to_le32(l.data_pool);
err = ceph_mdsc_do_request(mdsc, inode, req);
ceph_mdsc_put_request(req);
return err;
}
/*
* Return object name, size/offset information, and location (OSD
* number, network address) for a given file offset.
*/
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
struct ceph_ioctl_dataloc dl;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
struct ceph_object_id oid;
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
int r;
/* copy and validate */
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
down_read(&osdc->map_sem);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0) {
up_read(&osdc->map_sem);
return -EIO;
}
dl.file_offset -= dl.object_offset;
dl.object_size = ceph_file_layout_object_size(ci->i_layout);
dl.block_size = ceph_file_layout_su(ci->i_layout);
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
dl.block_offset = do_div(tmp, dl.block_size);
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
ceph_oid_set_name(&oid, dl.object_name);
r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
if (r < 0) {
up_read(&osdc->map_sem);
return r;
}
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
ceph_osd_addr(osdc->osdmap, dl.osd);
if (a)
memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
} else {
memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
}
up_read(&osdc->map_sem);
/* send result back to user */
if (copy_to_user(arg, &dl, sizeof(dl)))
return -EFAULT;
return 0;
}
static long ceph_ioctl_lazyio(struct file *file)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[fi->fmode]++;
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
ceph_check_caps(ci, 0, NULL);
} else {
dout("ioctl_layzio: file %p already lazy\n", file);
}
return 0;
}
static long ceph_ioctl_syncio(struct file *file)
{
struct ceph_file_info *fi = file->private_data;
fi->flags |= CEPH_F_SYNC;
return 0;
}
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
case CEPH_IOC_SET_LAYOUT:
return ceph_ioctl_set_layout(file, (void __user *)arg);
case CEPH_IOC_SET_LAYOUT_POLICY:
return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
case CEPH_IOC_GET_DATALOC:
return ceph_ioctl_get_dataloc(file, (void __user *)arg);
case CEPH_IOC_LAZYIO:
return ceph_ioctl_lazyio(file);
case CEPH_IOC_SYNCIO:
return ceph_ioctl_syncio(file);
}
return -ENOTTY;
}

fs/ceph/ioctl.h (new file, 100 lines)

@@ -0,0 +1,100 @@
#ifndef FS_CEPH_IOCTL_H
#define FS_CEPH_IOCTL_H
#include <linux/ioctl.h>
#include <linux/types.h>
#define CEPH_IOCTL_MAGIC 0x97
/*
* CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
* CEPH_IOC_SET_LAYOUT - set file layout
* CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
*
* The file layout specifies how file data is striped over objects in
* the distributed object store, which object pool they belong to (if
* it differs from the default), and an optional 'preferred osd' to
* store them on.
*
* Files get a new layout based on the policy set on the containing
* directory or one of its ancestors. The GET_LAYOUT ioctl will let
* you examine the layout for a file or the policy on a directory.
*
* SET_LAYOUT will let you set a layout on a newly created file. This
* only works immediately after the file is created and before any
* data is written to it.
*
* SET_LAYOUT_POLICY will let you set a layout policy (default layout)
* on a directory that will apply to any new files created in that
* directory (or any child directory that doesn't specify a layout of
* its own).
*/
/* use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
__u64 stripe_unit, stripe_count, object_size;
__u64 data_pool;
/* obsolete. new values ignored, always return -1 */
__s64 preferred_osd;
};
#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
struct ceph_ioctl_layout)
#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
struct ceph_ioctl_layout)
#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
struct ceph_ioctl_layout)
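/*
 * Example (illustrative): fetching a file's layout from userspace with
 * the ioctl defined above. The path is a placeholder.
 *
 *	struct ceph_ioctl_layout l;
 *	int fd = open("/mnt/ceph/file", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
 *		printf("stripe_unit %llu object_size %llu\n",
 *		       (unsigned long long)l.stripe_unit,
 *		       (unsigned long long)l.object_size);
 */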
/*
* CEPH_IOC_GET_DATALOC - get location of file data in the cluster
*
* Extract identity, address of the OSD and object storing a given
* file offset.
*/
struct ceph_ioctl_dataloc {
__u64 file_offset; /* in+out: file offset */
__u64 object_offset; /* out: offset in object */
__u64 object_no; /* out: object # */
__u64 object_size; /* out: object size */
char object_name[64]; /* out: object name */
__u64 block_offset; /* out: offset in block */
__u64 block_size; /* out: block length */
__s64 osd; /* out: osd # */
struct sockaddr_storage osd_addr; /* out: osd address */
};
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
struct ceph_ioctl_dataloc)
/*
* CEPH_IOC_LAZYIO - relax consistency
*
* Normally Ceph switches to synchronous IO when multiple clients have
* the file open (and one or more of them is open for write). Reads and writes bypass the
* page cache and go directly to the OSD. Setting this flag on a file
* descriptor will allow buffered IO for this file in cases where the
* application knows it won't interfere with other nodes (or doesn't
* care).
*/
#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
/*
* CEPH_IOC_SYNCIO - force synchronous IO
*
* This ioctl sets a file flag that forces the synchronous IO that
* bypasses the page cache, even if it is not necessary. This is
* essentially the opposite behavior of IOC_LAZYIO. This forces the
* same read/write path as a file opened by multiple clients when one
* or more of those clients has it open for write.
*
* Note that this type of sync IO takes a different path than a file
* opened with O_SYNC/D_SYNC (writes hit the page cache and are
* immediately flushed on page boundaries). It is very similar to
* O_DIRECT (writes bypass the page cache) except that O_DIRECT writes
* are not copied (user page must remain stable) and O_DIRECT writes
* have alignment restrictions (on the buffer and file offset).
*/
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
#endif

fs/ceph/locks.c (new file, 332 lines)

@@ -0,0 +1,332 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>
static u64 lock_secret;
static inline u64 secure_addr(void *addr)
{
u64 v = lock_secret ^ (u64)(unsigned long)addr;
/*
* Set the most significant bit, so that MDS knows the 'owner'
* is sufficient to identify the owner of lock. (old code uses
* both 'owner' and 'pid')
*/
v |= (1ULL << 63);
return v;
}
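/*
 * Worked example with made-up values: if lock_secret is
 * 0x1122334455667788 and the owner pointer is 0xffff880012345678, then
 *	v = 0x1122334455667788 ^ 0xffff880012345678 = 0xeeddbb44475221f0,
 * and bit 63 is forced on (already set here), so the MDS can key a
 * lock on 'owner' alone.
 */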
void __init ceph_flock_init(void)
{
get_random_bytes(&lock_secret, sizeof(lock_secret));
}
/**
* Implement fcntl and flock locking functions.
*/
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
u64 owner;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
length = 0;
else
length = fl->fl_end - fl->fl_start + 1;
owner = secure_addr(fl->fl_owner);
dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
"start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
(int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
req->r_args.filelock_change.owner = cpu_to_le64(owner);
req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
if (operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_WRLCK;
else
fl->fl_type = F_UNLCK;
fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
le64_to_cpu(req->r_reply_info.filelock_reply->length);
if (length >= 1)
fl->fl_end = length - 1;
else
fl->fl_end = 0;
}
ceph_mdsc_put_request(req);
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
"length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
(int)operation, (u64)fl->fl_pid, fl->fl_start,
length, wait, fl->fl_type, err);
return err;
}
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
*/
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
u8 wait = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
/* No mandatory locks */
if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
return -ENOLCK;
dout("ceph_lock, fl_owner: %p", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it */
if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
else if (IS_SETLKW(cmd))
wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
if (op != CEPH_MDS_OP_GETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
/* undo! This should only happen if
* the kernel detects local
* deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on posix_lock_file, undid lock",
err);
}
}
} else if (err == -ERESTARTSYS) {
dout("undoing lock\n");
ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
u8 wait = 0;
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
/* No mandatory locks */
if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
return -ENOLCK;
dout("ceph_flock, fl_file: %p", fl->fl_file);
if (IS_SETLKW(cmd))
wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
file, lock_cmd, wait, fl);
if (!err) {
err = flock_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on flock_lock_file_wait, undid lock", err);
}
} else if (err == -ERESTARTSYS) {
dout("undoing lock\n");
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
/**
* Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
struct file_lock *lock;
*fcntl_count = 0;
*flock_count = 0;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX)
++(*fcntl_count);
else if (lock->fl_flags & FL_FLOCK)
++(*flock_count);
}
dout("counted %d flock locks and %d fcntl locks",
*flock_count, *fcntl_count);
}
/**
* Encode the flock and fcntl locks for the given inode into the ceph_filelock
* array. Must be called with inode->i_lock already held.
* If we encounter more of a specific lock type than expected, return -ENOSPC.
*/
int ceph_encode_locks_to_buffer(struct inode *inode,
struct ceph_filelock *flocks,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
int l = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
++seen_fcntl;
if (seen_fcntl > num_fcntl_locks) {
err = -ENOSPC;
goto fail;
}
err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
++l;
}
}
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
++seen_flock;
if (seen_flock > num_flock_locks) {
err = -ENOSPC;
goto fail;
}
err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
++l;
}
}
fail:
return err;
}
/**
* Copy the encoded flock and fcntl locks into the pagelist.
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
* sequential flock locks.
* Returns zero on success.
*/
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks)
{
int err = 0;
__le32 nlocks;
nlocks = cpu_to_le32(num_fcntl_locks);
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
if (err)
goto out_fail;
err = ceph_pagelist_append(pagelist, flocks,
num_fcntl_locks * sizeof(*flocks));
if (err)
goto out_fail;
nlocks = cpu_to_le32(num_flock_locks);
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
if (err)
goto out_fail;
err = ceph_pagelist_append(pagelist,
&flocks[num_fcntl_locks],
num_flock_locks * sizeof(*flocks));
out_fail:
return err;
}
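/*
 * The resulting pagelist layout, per the format comment above:
 *
 *	__le32 num_fcntl_locks;
 *	struct ceph_filelock fcntl_locks[num_fcntl_locks];
 *	__le32 num_flock_locks;
 *	struct ceph_filelock flock_locks[num_flock_locks];
 *
 * The lock arrays are appended as raw struct memory, so client and MDS
 * must agree on the ceph_filelock wire layout.
 */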
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
int lock_to_ceph_filelock(struct file_lock *lock,
struct ceph_filelock *cephlock)
{
int err = 0;
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
switch (lock->fl_type) {
case F_RDLCK:
cephlock->type = CEPH_LOCK_SHARED;
break;
case F_WRLCK:
cephlock->type = CEPH_LOCK_EXCL;
break;
case F_UNLCK:
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
dout("Have unknown lock type %d", lock->fl_type);
err = -EINVAL;
}
return err;
}

fs/ceph/mds_client.c (new file, 3758 lines)

File diff suppressed because it is too large.

fs/ceph/mds_client.h (new file, 394 lines)

@@ -0,0 +1,394 @@
#ifndef _FS_CEPH_MDS_CLIENT_H
#define _FS_CEPH_MDS_CLIENT_H
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
/*
* Some lock dependencies:
*
* session->s_mutex
* mdsc->mutex
*
* mdsc->snap_rwsem
*
* ci->i_ceph_lock
* mdsc->snap_flush_lock
* mdsc->cap_delay_lock
*
*/
struct ceph_fs_client;
struct ceph_cap;
/*
* parsed info about a single inode. pointers are into the encoded
* on-wire structures within the mds reply message payload.
*/
struct ceph_mds_reply_info_in {
struct ceph_mds_reply_inode *in;
struct ceph_dir_layout dir_layout;
u32 symlink_len;
char *symlink;
u32 xattr_len;
char *xattr_data;
};
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
* and directory contents (for readdir results), or
* 2) the file range lock info (for fcntl F_GETLK results).
*/
struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_head *head;
/* trace */
struct ceph_mds_reply_info_in diri, targeti;
struct ceph_mds_reply_dirfrag *dirfrag;
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
/* extra */
union {
/* for fcntl F_GETLK results */
struct ceph_filelock *filelock_reply;
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
char **dir_dname;
u32 *dir_dname_len;
struct ceph_mds_reply_lease **dir_dlease;
struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end;
};
/* for create results */
struct {
bool has_create_ino;
u64 ino;
};
};
/* encoded blob describing snapshot contexts for certain
operations (e.g., open) */
void *snapblob;
int snapblob_len;
};
/*
* cap releases are batched and sent to the MDS en masse.
*/
#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
sizeof(struct ceph_mds_cap_release)) / \
sizeof(struct ceph_mds_cap_item))
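/*
 * Worked example (struct sizes assumed for illustration, not measured
 * from this tree): with PAGE_CACHE_SIZE == 4096, a 4-byte
 * ceph_mds_cap_release header and 24-byte ceph_mds_cap_items,
 * CEPH_CAPS_PER_RELEASE == (4096 - 4) / 24 == 170 items per message.
 */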
/*
* state associated with each MDS<->client session
*/
enum {
CEPH_MDS_SESSION_NEW = 1,
CEPH_MDS_SESSION_OPENING = 2,
CEPH_MDS_SESSION_OPEN = 3,
CEPH_MDS_SESSION_HUNG = 4,
CEPH_MDS_SESSION_CLOSING = 5,
CEPH_MDS_SESSION_RESTARTING = 6,
CEPH_MDS_SESSION_RECONNECTING = 7,
};
struct ceph_mds_session {
struct ceph_mds_client *s_mdsc;
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
struct ceph_connection s_con;
struct ceph_auth_handshake s_auth;
/* protected by s_gen_ttl_lock */
spinlock_t s_gen_ttl_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
atomic_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
/*
* modes of choosing which MDS to send a request to
*/
enum {
USE_ANY_MDS,
USE_RANDOM_MDS,
USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
};
struct ceph_mds_request;
struct ceph_mds_client;
/*
* request completion callback
*/
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
/*
* an in-flight mds request
*/
struct ceph_mds_request {
u64 r_tid; /* transaction id */
struct rb_node r_node;
struct ceph_mds_client *r_mdsc;
int r_op; /* mds op code */
/* operation on what? */
struct inode *r_inode; /* arg1 */
struct dentry *r_dentry; /* arg1 */
struct dentry *r_old_dentry; /* arg2: rename from or link from */
struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2;
struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
struct inode *r_target_inode; /* resulting inode */
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
struct timespec r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
u32 r_direct_hash; /* choose dir frag based on this dentry hash */
bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */
struct ceph_pagelist *r_pagelist;
/* what caps shall we drop? */
int r_inode_drop, r_inode_unless;
int r_dentry_drop, r_dentry_unless;
int r_old_dentry_drop, r_old_dentry_unless;
struct inode *r_old_inode;
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
bool r_aborted;
unsigned long r_timeout; /* optional. jiffies */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
/* link unsafe requests to parent directory, for fsync */
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any */
u32 r_sent_on_mseq; /* cap mseq request was sent at */
struct kref r_kref;
struct list_head r_wait;
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
struct list_head r_unsafe_item; /* per-session unsafe list item */
bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
int r_num_caps;
};
/*
* mds client state
*/
struct ceph_mds_client {
struct ceph_fs_client *fsc;
struct mutex mutex; /* all nested structures */
struct ceph_mdsmap *mdsmap;
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
int max_sessions; /* len of sessions array */
int stopping; /* true if shutting down */
/*
* snap_rwsem will cover cap linkage into snaprealms, and
* realm snap contexts. (later, we can do per-realm snap
* context locks..) the empty list contains realms with no
* references (implying they contain no inodes with caps) that
* should be destroyed.
*/
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
spinlock_t cap_delay_lock; /* protects cap_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
u64 cap_flush_seq;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migrating... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
/*
* Cap reservations
*
* Maintain a global pool of preallocated struct ceph_caps, referenced
* by struct ceph_caps_reservations. This ensures that we preallocate
* memory needed to successfully process an MDS response. (If an MDS
* sends us cap information and we fail to process it, we will have
* problems due to the client and MDS being out of sync.)
*
* Reservations are 'owned' by a ceph_cap_reservation context.
*/
spinlock_t caps_list_lock;
struct list_head caps_list; /* unused (reserved or
unreserved) */
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
spinlock_t dentry_lru_lock;
struct list_head dentry_lru;
int num_dentry;
};
extern const char *ceph_mds_op_name(int op);
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
atomic_inc(&s->s_ref);
return s;
}
extern const char *ceph_session_state_name(int s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
struct ceph_msg *msg, int mds);
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct inode *inode,
struct dentry *dn);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
}
extern void ceph_mdsc_release_request(struct kref *kref);
static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
int stop_on_nosnap);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct inode *inode,
struct dentry *dentry, char action,
u32 seq);
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
#endif

189
fs/ceph/mdsmap.c Normal file

@ -0,0 +1,189 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/ceph/mdsmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
#include "super.h"
/*
* choose a random mds that is "up" (i.e. has a state > 0), or -1 if none are.
*/
int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
{
int n = 0;
int i;
/* special case for one mds */
if (1 == m->m_max_mds && m->m_info[0].state > 0)
return 0;
/* count */
for (i = 0; i < m->m_max_mds; i++)
if (m->m_info[i].state > 0)
n++;
if (n == 0)
return -1;
/* pick */
n = prandom_u32() % n;
i = 0;
for (i = 0; n > 0; i++, n--)
while (m->m_info[i].state <= 0)
i++;
return i;
}
/*
* Decode an MDS map
*
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
{
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
int err = -EINVAL;
u16 version;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (m == NULL)
return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad);
if (version > 3) {
pr_warn("got mdsmap version %d > 3, failing\n", version);
goto bad;
}
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p);
m->m_client_epoch = ceph_decode_32(p);
m->m_last_failure = ceph_decode_32(p);
m->m_root = ceph_decode_32(p);
m->m_session_timeout = ceph_decode_32(p);
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
if (m->m_info == NULL)
goto badmem;
/* pick out active nodes from mds_info (state > 0) */
n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
s32 mds, inc, state;
u64 state_seq;
u8 infoversion;
struct ceph_entity_addr addr;
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
infoversion = ceph_decode_8(p);
*p += sizeof(u64);
namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen;
ceph_decode_need(p, end,
4*sizeof(u32) + sizeof(u64) +
sizeof(addr) + sizeof(struct ceph_timespec),
bad);
mds = ceph_decode_32(p);
inc = ceph_decode_32(p);
state = ceph_decode_32(p);
state_seq = ceph_decode_64(p);
ceph_decode_copy(p, &addr, sizeof(addr));
ceph_decode_addr(&addr);
ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
if (infoversion >= 2) {
ceph_decode_32_safe(p, end, num_export_targets, bad);
pexport_targets = *p;
*p += num_export_targets * sizeof(u32);
} else {
num_export_targets = 0;
}
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
if (mds < 0 || mds >= m->m_max_mds || state <= 0)
continue;
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
info->addr = addr;
info->laggy = (laggy_since.tv_sec != 0 ||
laggy_since.tv_nsec != 0);
info->num_export_targets = num_export_targets;
if (num_export_targets) {
info->export_targets = kcalloc(num_export_targets,
sizeof(u32), GFP_NOFS);
if (info->export_targets == NULL)
goto badmem;
for (j = 0; j < num_export_targets; j++)
info->export_targets[j] =
ceph_decode_32(&pexport_targets);
} else {
info->export_targets = NULL;
}
}
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n;
m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
goto badmem;
ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
m->m_data_pg_pools[i] = ceph_decode_64(p);
m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
badmem:
err = -ENOMEM;
bad:
pr_err("corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
for (i = 0; i < m->m_max_mds; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
kfree(m);
}

932
fs/ceph/snap.c Normal file

@ -0,0 +1,932 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/sort.h>
#include <linux/slab.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/decode.h>
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
* implement snapshots at a single point in the system, ceph's
* distributed access to storage requires clients to help decide
* whether a write logically occurs before or after a recently created
* snapshot.
*
* This provides a perfect instantaneous client-wide snapshot. Between
* clients, however, snapshots may appear to be applied at slightly
* different points in time, depending on delays in delivering the
* snapshot notification.
*
* Snapshots are _not_ file system-wide. Instead, each snapshot
* applies to the subdirectory nested beneath some directory. This
* effectively divides the hierarchy into multiple "realms," where all
* of the files contained by each realm share the same set of
* snapshots. An individual realm's snap set contains snapshots
* explicitly created on that realm, as well as any snaps in its
* parent's snap set _after_ the point at which the parent became its
* parent (due to, say, a rename). Similarly, snaps from prior parents
* during the intervals in which they were the parent are included.
*
* The client is spared most of this detail, fortunately... it need only
* maintain a hierarchy of realms reflecting the current parent/child
* realm relationship, and for each realm has an explicit list of snaps
* inherited from prior parents.
*
* A snap_realm struct is maintained for realms containing every inode
* with an open cap in the system. (The needed snap realm information is
* provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
* version number is used to ensure that as realm parameters change (new
* snapshot, new parent, etc.) the client's realm hierarchy is updated.
*
* The realm hierarchy drives the generation of a 'snap context' for each
* realm, which simply lists the resulting set of snaps for the realm. This
* is attached to any writes sent to OSDs.
*/
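/*
 * A hypothetical example of the inheritance rule above: realm /a has
 * its own snap {8}, its parent / has snaps {3, 6}, and / became /a's
 * parent at snapid 5 (parent_since = 5). /a's snap context then
 * merges 8 with the parent snaps taken at or after 5, giving the
 * reverse-sorted vector [8, 6]; snap 3 could reach /a only via
 * prior_parent_snaps, i.e. if it had been taken under an earlier
 * parent. build_snap_context() below performs exactly this merge.
 */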
/*
* Unfortunately error handling is a bit mixed here. If we get a snap
* update, but don't have enough memory to update our realm hierarchy,
* it's not clear what we can do about it (besides complaining to the
* console).
*/
/*
* increase ref count for the realm
*
* caller must hold snap_rwsem for write.
*/
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
dout("get_realm %p %d -> %d\n", realm,
atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
/*
* since we _only_ increment realm refs or empty the empty
* list with snap_rwsem held, adjusting the empty list here is
* safe. we do need to protect against concurrent empty list
* additions, however.
*/
if (atomic_read(&realm->nref) == 0) {
spin_lock(&mdsc->snap_empty_lock);
list_del_init(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock);
}
atomic_inc(&realm->nref);
}
static void __insert_snap_realm(struct rb_root *root,
struct ceph_snap_realm *new)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct ceph_snap_realm *r = NULL;
while (*p) {
parent = *p;
r = rb_entry(parent, struct ceph_snap_realm, node);
if (new->ino < r->ino)
p = &(*p)->rb_left;
else if (new->ino > r->ino)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->node, parent, p);
rb_insert_color(&new->node, root);
}
/*
* create and get the realm rooted at @ino and bump its ref count.
*
* caller must hold snap_rwsem for write.
*/
static struct ceph_snap_realm *ceph_create_snap_realm(
struct ceph_mds_client *mdsc,
u64 ino)
{
struct ceph_snap_realm *realm;
realm = kzalloc(sizeof(*realm), GFP_NOFS);
if (!realm)
return ERR_PTR(-ENOMEM);
atomic_set(&realm->nref, 0); /* tree does not take a ref */
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
dout("create_snap_realm %llx %p\n", realm->ino, realm);
return realm;
}
/*
* lookup the realm rooted at @ino.
*
* caller must hold snap_rwsem for write.
*/
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
{
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
while (n) {
r = rb_entry(n, struct ceph_snap_realm, node);
if (ino < r->ino)
n = n->rb_left;
else if (ino > r->ino)
n = n->rb_right;
else {
dout("lookup_snap_realm %llx %p\n", r->ino, r);
return r;
}
}
return NULL;
}
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
/*
* called with snap_rwsem (write)
*/
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
if (realm->parent) {
list_del_init(&realm->child_item);
__put_snap_realm(mdsc, realm->parent);
}
kfree(realm->prior_parent_snaps);
kfree(realm->snaps);
ceph_put_snap_context(realm->cached_context);
kfree(realm);
}
/*
* caller holds snap_rwsem (write)
*/
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
* caller needn't hold any locks
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
if (!atomic_dec_and_test(&realm->nref))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
}
/*
* Clean up any realms whose ref counts have dropped to zero. Note
* that this does not include realms that were created but not yet
* used.
*
* Called under snap_rwsem (write)
*/
static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
struct ceph_snap_realm *realm;
spin_lock(&mdsc->snap_empty_lock);
while (!list_empty(&mdsc->snap_empty)) {
realm = list_first_entry(&mdsc->snap_empty,
struct ceph_snap_realm, empty_item);
list_del(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
spin_lock(&mdsc->snap_empty_lock);
}
spin_unlock(&mdsc->snap_empty_lock);
}
void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
down_write(&mdsc->snap_rwsem);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
/*
* adjust the parent realm of a given @realm. adjust child list, and parent
* pointers, and ref counts appropriately.
*
* return 1 if the parent was changed, 0 if unchanged, <0 on error.
*
* caller must hold snap_rwsem for write.
*/
static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm,
u64 parentino)
{
struct ceph_snap_realm *parent;
if (realm->parent_ino == parentino)
return 0;
parent = ceph_lookup_snap_realm(mdsc, parentino);
if (!parent) {
parent = ceph_create_snap_realm(mdsc, parentino);
if (IS_ERR(parent))
return PTR_ERR(parent);
}
dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
realm->ino, realm, realm->parent_ino, realm->parent,
parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
}
realm->parent_ino = parentino;
realm->parent = parent;
ceph_get_snap_realm(mdsc, parent);
list_add(&realm->child_item, &parent->children);
return 1;
}
static int cmpu64_rev(const void *a, const void *b)
{
if (*(u64 *)a < *(u64 *)b)
return 1;
if (*(u64 *)a > *(u64 *)b)
return -1;
return 0;
}
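/*
 * e.g. sort()ing {3, 8, 6} with cmpu64_rev yields {8, 6, 3}: snap
 * vectors are kept in descending (newest-first) order.
 */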
/*
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
/*
* build parent context, if it hasn't been built.
* conservatively estimate that all parent snaps might be
* included by us.
*/
if (parent) {
if (!parent->cached_context) {
err = build_snap_context(parent);
if (err)
goto fail;
}
num += parent->cached_context->num_snaps;
}
/* do I actually need to update? not if my context seq
matches realm seq, and my parent's does too. (this works
because rebuild_snap_realms() works _downward_ in
hierarchy after each update.) */
if (realm->cached_context &&
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
" (unchanged)\n",
realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int) realm->cached_context->num_snaps);
return 0;
}
/* alloc new snap context */
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
goto fail;
snapc = ceph_create_snap_context(num, GFP_NOFS);
if (!snapc)
goto fail;
/* build (reverse sorted) snap vector */
num = 0;
snapc->seq = realm->seq;
if (parent) {
u32 i;
/* include any of parent's snaps occurring _after_ my
parent became my parent */
for (i = 0; i < parent->cached_context->num_snaps; i++)
if (parent->cached_context->snaps[i] >=
realm->parent_since)
snapc->snaps[num++] =
parent->cached_context->snaps[i];
if (parent->cached_context->seq > snapc->seq)
snapc->seq = parent->cached_context->seq;
}
memcpy(snapc->snaps + num, realm->snaps,
sizeof(u64)*realm->num_snaps);
num += realm->num_snaps;
memcpy(snapc->snaps + num, realm->prior_parent_snaps,
sizeof(u64)*realm->num_prior_parent_snaps);
num += realm->num_prior_parent_snaps;
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
if (realm->cached_context)
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
return 0;
fail:
/*
* if we fail, clear old (incorrect) cached_context... hopefully
* we'll have better luck building it later
*/
if (realm->cached_context) {
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
realm, err);
return err;
}
/*
* rebuild snap context for the given realm and all of its children.
*/
static void rebuild_snap_realms(struct ceph_snap_realm *realm)
{
struct ceph_snap_realm *child;
dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
build_snap_context(realm);
list_for_each_entry(child, &realm->children, child_item)
rebuild_snap_realms(child);
}
/*
* helper to allocate and decode an array of snapids. free prior
* instance, if any.
*/
static int dup_array(u64 **dst, __le64 *src, u32 num)
{
u32 i;
kfree(*dst);
if (num) {
*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
if (!*dst)
return -ENOMEM;
for (i = 0; i < num; i++)
(*dst)[i] = get_unaligned_le64(src + i);
} else {
*dst = NULL;
}
return 0;
}
/*
* When a snapshot is applied, the size/mtime inode metadata is queued
* in a ceph_cap_snap (one for each snapshot) until writeback
* completes and the metadata can be flushed back to the MDS.
*
* However, if a (sync) write is currently in-progress when we apply
* the snapshot, we have to wait until the write succeeds or fails
* (and a final size/mtime is known). In this case we set
* cap_snap->writing = 1 and the cap_snap is said to be "pending."
* When the write finishes, we call __ceph_finish_cap_snap().
*
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
if (!capsnap) {
pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
return;
}
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
/*
* If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish
* up this capsnap it will be.
*/
if (used & CEPH_CAP_FILE_WR)
dirty |= CEPH_CAP_FILE_WR;
if (__ceph_have_pending_cap_snap(ci)) {
/* there is no point in queuing multiple "pending" cap_snaps,
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap);
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
struct ceph_snap_context *snapc = ci->i_head_snapc;
/*
* if this is a sync write, we may need to go to the snaprealm
* to get the current snapc.
*/
if (!snapc)
snapc = ci->i_snap_realm->cached_context;
dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
inode, capsnap, snapc, ceph_cap_string(dirty));
ihold(inode);
atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
capsnap->mode = inode->i_mode;
capsnap->uid = inode->i_uid;
capsnap->gid = inode->i_gid;
if (dirty & CEPH_CAP_XATTR_EXCL) {
__ceph_build_xattrs_blob(ci);
capsnap->xattr_blob =
ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
} else {
capsnap->xattr_blob = NULL;
capsnap->xattr_version = 0;
}
/* dirty page count moved from _head to this cap_snap;
all subsequent page dirties occur _after_ this
snapshot. */
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0;
capsnap->context = snapc;
ci->i_head_snapc =
ceph_get_snap_context(ci->i_snap_realm->cached_context);
dout(" new snapc is %p\n", ci->i_head_snapc);
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
" seq %llu used WR, now pending\n", inode,
capsnap, snapc, snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
} else {
dout("queue_cap_snap %p nothing dirty|writing\n", inode);
kfree(capsnap);
}
spin_unlock(&ci->i_ceph_lock);
}
/*
* Finalize the size and mtime for a cap_snap: that is, settle on the
* final values to be used for the snapshot and flushed back to the mds.
*
* If capsnap can now be flushed, add to snap_flush list, and return 1.
*
* Caller must hold i_ceph_lock.
*/
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
capsnap->ctime = inode->i_ctime;
capsnap->time_warp_seq = ci->i_time_warp_seq;
if (capsnap->dirty_pages) {
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
"still has %d dirty pages\n", inode, capsnap,
capsnap->context, capsnap->context->seq,
ceph_cap_string(capsnap->dirty), capsnap->size,
capsnap->dirty_pages);
return 0;
}
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
/*
* Queue cap_snaps for snap writeback for this realm and its children.
* Called under snap_rwsem, so realm topology won't change.
*/
static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
struct ceph_snap_realm *child;
dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps,
i_snap_realm_item) {
struct inode *inode = igrab(&ci->vfs_inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
if (lastinode)
iput(lastinode);
lastinode = inode;
ceph_queue_cap_snap(ci);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
if (lastinode)
iput(lastinode);
list_for_each_entry(child, &realm->children, child_item) {
dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
realm, realm->ino, child, child->ino);
list_del_init(&child->dirty_item);
list_add(&child->dirty_item, &realm->dirty_item);
}
list_del_init(&realm->dirty_item);
dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
}
/*
* Parse and apply a snapblob "snap trace" from the MDS. This specifies
* the snap realm parameters for a given realm and all of its ancestors,
* up to the root.
*
* Caller must hold snap_rwsem for write.
*/
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
void *p, void *e, bool deletion)
{
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm;
int invalidate = 0;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
dout("update_snap_trace deletion=%d\n", deletion);
more:
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
le32_to_cpu(ri->num_prior_parent_snaps)), bad);
snaps = p;
p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
prior_parent_snaps = p;
p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
if (!realm) {
realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
if (IS_ERR(realm)) {
err = PTR_ERR(realm);
goto fail;
}
}
/* ensure the parent is correct */
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
invalidate += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
dout("update_snap_trace updating %llx %p %lld -> %lld\n",
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
realm->created = le64_to_cpu(ri->created);
realm->parent_since = le64_to_cpu(ri->parent_since);
realm->num_snaps = le32_to_cpu(ri->num_snaps);
err = dup_array(&realm->snaps, snaps, realm->num_snaps);
if (err < 0)
goto fail;
realm->num_prior_parent_snaps =
le32_to_cpu(ri->num_prior_parent_snaps);
err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
realm->num_prior_parent_snaps);
if (err < 0)
goto fail;
/* queue realm for cap_snap creation */
list_add(&realm->dirty_item, &dirty_realms);
invalidate = 1;
} else if (!realm->cached_context) {
dout("update_snap_trace %llx %p seq %lld new\n",
realm->ino, realm, realm->seq);
invalidate = 1;
} else {
dout("update_snap_trace %llx %p seq %lld unchanged\n",
realm->ino, realm, realm->seq);
}
dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
realm, invalidate, p, e);
if (p < e)
goto more;
/* invalidate when we reach the _end_ (root) of the trace */
if (invalidate)
rebuild_snap_realms(realm);
/*
* queue cap snaps _after_ we've built the new snap contexts,
* so that i_head_snapc can be set appropriately.
*/
while (!list_empty(&dirty_realms)) {
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
queue_realm_cap_snaps(realm);
}
__cleanup_empty_realms(mdsc);
return 0;
bad:
err = -EINVAL;
fail:
pr_err("update_snap_trace error %d\n", err);
return err;
}
/*
* Send any cap_snaps that are queued for flush. Try to carry
* s_mutex across multiple snap flushes to avoid locking overhead.
*
* Caller holds no locks.
*/
static void flush_snaps(struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci;
struct inode *inode;
struct ceph_mds_session *session = NULL;
dout("flush_snaps\n");
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, &session, 0);
spin_unlock(&ci->i_ceph_lock);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
spin_unlock(&mdsc->snap_flush_lock);
if (session) {
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
}
dout("flush_snaps done\n");
}
/*
* Handle a snap notification from the MDS.
*
* This can take two basic forms: the simplest is just a snap creation
* or deletion notification on an existing realm. This should update the
* realm and its children.
*
* The more difficult case is realm creation, due to snap creation at a
* new point in the file hierarchy, or due to a rename that moves a file or
* directory into another realm.
*/
void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
int op;
int trace_len;
struct ceph_snap_realm *realm = NULL;
void *p = msg->front.iov_base;
void *e = p + msg->front.iov_len;
struct ceph_mds_snap_head *h;
int num_split_inos, num_split_realms;
__le64 *split_inos = NULL, *split_realms = NULL;
int i;
int locked_rwsem = 0;
/* decode */
if (msg->front.iov_len < sizeof(*h))
goto bad;
h = p;
op = le32_to_cpu(h->op);
split = le64_to_cpu(h->split); /* non-zero if we are splitting an
* existing realm */
num_split_inos = le32_to_cpu(h->num_split_inos);
num_split_realms = le32_to_cpu(h->num_split_realms);
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
session->s_seq++;
mutex_unlock(&session->s_mutex);
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
if (op == CEPH_SNAP_OP_SPLIT) {
struct ceph_mds_snap_realm *ri;
/*
* A "split" breaks part of an existing realm off into
* a new realm. The MDS provides a list of inodes
* (with caps) and child realms that belong to the new
* child.
*/
split_inos = p;
p += sizeof(u64) * num_split_inos;
split_realms = p;
p += sizeof(u64) * num_split_realms;
ceph_decode_need(&p, e, sizeof(*ri), bad);
/* we will peek at realm info here, but will _not_
* advance p, as the realm update will occur below in
* ceph_update_snap_trace. */
ri = p;
realm = ceph_lookup_snap_realm(mdsc, split);
if (!realm) {
realm = ceph_create_snap_realm(mdsc, split);
if (IS_ERR(realm))
goto out;
}
ceph_get_snap_realm(mdsc, realm);
dout("splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
struct ceph_vino vino = {
.ino = le64_to_cpu(split_inos[i]),
.snap = CEPH_NOSNAP,
};
struct inode *inode = ceph_find_inode(sb, vino);
struct ceph_inode_info *ci;
struct ceph_snap_realm *oldrealm;
if (!inode)
continue;
ci = ceph_inode(inode);
spin_lock(&ci->i_ceph_lock);
if (!ci->i_snap_realm)
goto skip_inode;
/*
* If this inode belongs to a realm that was
* created after our new realm, we experienced
* a race (due to another split notification
* arriving from a different MDS). So skip
* this inode.
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
dout(" leaving %p in newer realm %llx %p\n",
inode, ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
dout(" will move %p to split realm %llx %p\n",
inode, realm->ino, realm);
/*
* Move the inode to the new realm
*/
spin_lock(&realm->inodes_with_caps_lock);
list_del_init(&ci->i_snap_realm_item);
list_add(&ci->i_snap_realm_item,
&realm->inodes_with_caps);
oldrealm = ci->i_snap_realm;
ci->i_snap_realm = realm;
spin_unlock(&realm->inodes_with_caps_lock);
spin_unlock(&ci->i_ceph_lock);
ceph_get_snap_realm(mdsc, realm);
ceph_put_snap_realm(mdsc, oldrealm);
iput(inode);
continue;
skip_inode:
spin_unlock(&ci->i_ceph_lock);
iput(inode);
}
/* we may have taken some of the old realm's children. */
for (i = 0; i < num_split_realms; i++) {
struct ceph_snap_realm *child =
ceph_lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i]));
if (!child)
continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
}
/*
* update using the provided snap trace. if we are deleting a
* snap, we can avoid queueing cap_snaps.
*/
ceph_update_snap_trace(mdsc, p, e,
op == CEPH_SNAP_OP_DESTROY);
if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
ceph_put_snap_realm(mdsc, realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
flush_snaps(mdsc);
return;
bad:
pr_err("corrupt snap message from mds%d\n", mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
up_write(&mdsc->snap_rwsem);
return;
}

124
fs/ceph/strings.c Normal file

@ -0,0 +1,124 @@
/*
* Ceph fs string constants
*/
#include <linux/module.h>
#include <linux/ceph/types.h>
const char *ceph_mds_state_name(int s)
{
switch (s) {
/* down and out */
case CEPH_MDS_STATE_DNE: return "down:dne";
case CEPH_MDS_STATE_STOPPED: return "down:stopped";
/* up and out */
case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */
case CEPH_MDS_STATE_REPLAY: return "up:replay";
case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
case CEPH_MDS_STATE_ACTIVE: return "up:active";
case CEPH_MDS_STATE_STOPPING: return "up:stopping";
}
return "???";
}
const char *ceph_session_op_name(int op)
{
switch (op) {
case CEPH_SESSION_REQUEST_OPEN: return "request_open";
case CEPH_SESSION_OPEN: return "open";
case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
case CEPH_SESSION_CLOSE: return "close";
case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
case CEPH_SESSION_STALE: return "stale";
case CEPH_SESSION_RECALL_STATE: return "recall_state";
case CEPH_SESSION_FLUSHMSG: return "flushmsg";
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
}
return "???";
}
const char *ceph_mds_op_name(int op)
{
switch (op) {
case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
case CEPH_MDS_OP_SETLAYOUT: return "setlayout";
case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link";
case CEPH_MDS_OP_UNLINK: return "unlink";
case CEPH_MDS_OP_RENAME: return "rename";
case CEPH_MDS_OP_MKDIR: return "mkdir";
case CEPH_MDS_OP_RMDIR: return "rmdir";
case CEPH_MDS_OP_SYMLINK: return "symlink";
case CEPH_MDS_OP_CREATE: return "create";
case CEPH_MDS_OP_OPEN: return "open";
case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
case CEPH_MDS_OP_LSSNAP: return "lssnap";
case CEPH_MDS_OP_MKSNAP: return "mksnap";
case CEPH_MDS_OP_RMSNAP: return "rmsnap";
case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
}
return "???";
}
const char *ceph_cap_op_name(int op)
{
switch (op) {
case CEPH_CAP_OP_GRANT: return "grant";
case CEPH_CAP_OP_REVOKE: return "revoke";
case CEPH_CAP_OP_TRUNC: return "trunc";
case CEPH_CAP_OP_EXPORT: return "export";
case CEPH_CAP_OP_IMPORT: return "import";
case CEPH_CAP_OP_UPDATE: return "update";
case CEPH_CAP_OP_DROP: return "drop";
case CEPH_CAP_OP_FLUSH: return "flush";
case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
case CEPH_CAP_OP_RELEASE: return "release";
case CEPH_CAP_OP_RENEW: return "renew";
}
return "???";
}
const char *ceph_lease_op_name(int o)
{
switch (o) {
case CEPH_MDS_LEASE_REVOKE: return "revoke";
case CEPH_MDS_LEASE_RELEASE: return "release";
case CEPH_MDS_LEASE_RENEW: return "renew";
case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
}
return "???";
}
const char *ceph_snap_op_name(int o)
{
switch (o) {
case CEPH_SNAP_OP_UPDATE: return "update";
case CEPH_SNAP_OP_CREATE: return "create";
case CEPH_SNAP_OP_DESTROY: return "destroy";
case CEPH_SNAP_OP_SPLIT: return "split";
}
return "???";
}

1061
fs/ceph/super.c Normal file

File diff suppressed because it is too large

906
fs/ceph/super.h Normal file

@ -0,0 +1,906 @@
#ifndef _FS_CEPH_SUPER_H
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
#include <linux/completion.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
#include <linux/mempool.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/posix_acl.h>
#include <linux/ceph/libceph.h>
#ifdef CONFIG_CEPH_FSCACHE
#include <linux/fscache.h>
#endif
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
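/*
 * Minimal usage sketch of the two macros above (hypothetical call
 * sites, illustration only):
 *
 * ceph_set_mount_opt(fsc, INO32);  ORs in CEPH_MOUNT_OPT_INO32
 * if (ceph_test_mount_opt(fsc, RBYTES))  tests CEPH_MOUNT_OPT_RBYTES
 */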
#define CEPH_RSIZE_DEFAULT 0 /* max read size */
#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
struct ceph_mount_options {
int flags;
int sb_flags;
int wsize; /* max write size */
int rsize; /* max read size */
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
int cap_release_safety;
int max_readdir; /* max readdir result (entries) */
int max_readdir_bytes; /* max readdir result (bytes) */
/*
* everything above this point can be memcmp'd; everything below
* is handled in compare_mount_options()
*/
char *snapdir_name; /* default ".snap" */
};
struct ceph_fs_client {
struct super_block *sb;
struct ceph_mount_options *mount_options;
struct ceph_client *client;
unsigned long mount_state;
int min_caps; /* min caps I added */
struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
struct backing_dev_info backing_dev_info;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
struct dentry *debugfs_mds_sessions;
#endif
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
struct workqueue_struct *revalidate_wq;
#endif
};
/*
* File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read
* and write data. For any given inode, we should have one or more
* capabilities, one issued by each metadata server, and our
* cumulative access is the OR of all issued capabilities.
*
* Each cap is referenced by the inode's i_caps rbtree and by per-mds
* session capability lists.
*/
struct ceph_cap {
struct ceph_inode_info *ci;
struct rb_node ci_node; /* per-ci cap tree */
struct ceph_mds_session *session;
struct list_head session_caps; /* per-session caplist */
int mds;
u64 cap_id; /* unique cap id (mds provided) */
int issued; /* latest, from the mds */
int implemented; /* implemented superset of issued (for revocation) */
int mds_wanted;
u32 seq, issue_seq, mseq;
u32 cap_gen; /* active/stale cycle */
unsigned long last_used;
struct list_head caps_item;
};
#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty
* data before flushing the snapped state (tracked here) back to the MDS.
*/
struct ceph_cap_snap {
atomic_t nref;
struct ceph_inode_info *ci;
struct list_head ci_item, flushing_item;
u64 follows, flush_tid;
int issued, dirty;
struct ceph_snap_context *context;
umode_t mode;
kuid_t uid;
kgid_t gid;
struct ceph_buffer *xattr_blob;
u64 xattr_version;
u64 size;
struct timespec mtime, atime, ctime;
u64 time_warp_seq;
int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */
};
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
if (atomic_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
}
}
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
* metadata authority is delegated, and whether/where metadata is replicated.
*
* A _leaf_ frag will be present in the i_fragtree IFF there is
* delegation info. That is, if mds >= 0 || ndist > 0.
*/
#define CEPH_MAX_DIRFRAG_REP 4
struct ceph_inode_frag {
struct rb_node node;
/* fragtree state */
u32 frag;
int split_by; /* i.e. 2^(split_by) children */
/* delegation and replication info */
int mds; /* -1 if same authority as parent */
int ndist; /* >0 if replicated */
int dist[CEPH_MAX_DIRFRAG_REP];
};
/*
* We cache inode xattrs as an encoded blob until they are first used,
* at which point we parse them into an rbtree.
*/
struct ceph_inode_xattr {
struct rb_node node;
const char *name;
int name_len;
const char *val;
int val_len;
int dirty;
int should_free_name;
int should_free_val;
};
/*
* Ceph dentry state
*/
struct ceph_dentry_info {
struct ceph_mds_session *lease_session;
u32 lease_gen, lease_shared_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
struct list_head lru;
struct dentry *dentry;
u64 time;
u64 offset;
};
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
* this until someone actually calls getxattr, etc.
*
* blob->vec.iov_len == 4 implies there are no xattrs; blob ==
* NULL means we don't know.
*/
struct ceph_buffer *blob, *prealloc_blob;
struct rb_root index;
bool dirty;
int count;
int names_size;
int vals_size;
u64 version, index_version;
};
/*
* Ceph inode.
*/
struct ceph_inode_info {
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
u64 i_version;
u32 i_time_warp_seq;
unsigned i_ceph_flags;
atomic_t i_release_count;
atomic_t i_complete_count;
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
char *i_symlink;
/* for dirs */
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
/* capabilities. protected _both_ by i_ceph_lock and cap->session's
* s_mutex. */
struct rb_root i_caps; /* cap list */
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
struct list_head i_dirty_item, i_flushing_item;
u64 i_cap_flush_seq;
/* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */
u64 i_max_size; /* max file size authorized by mds */
u64 i_reported_size; /* (max_)size reported to or requested of mds */
u64 i_wanted_max_size; /* offset we'd like to write to */
u64 i_requested_max_size; /* max_size we've requested */
/* held references to caps */
int i_pin_ref;
int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
u32 i_shared_gen; /* increment each time we get FILE_SHARED */
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
struct work_struct i_wb_work; /* writeback work */
struct work_struct i_pg_inv_work; /* page invalidation work */
struct work_struct i_vmtruncate_work;
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
u32 i_fscache_gen; /* sequence, for delayed fscache validate */
struct work_struct i_revalidate_work;
#endif
struct inode vfs_inode; /* at end */
};
static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
{
return container_of(inode, struct ceph_inode_info, vfs_inode);
}
static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
{
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
{
return (struct ceph_fs_client *)sb->s_fs_info;
}
static inline struct ceph_vino ceph_vino(struct inode *inode)
{
return ceph_inode(inode)->i_vino;
}
/*
* ino_t is <64 bits on many architectures, blech.
*
* i_ino (kernel inode) st_ino (userspace)
* i386 32 32
* x86_64+ino32 64 32
* x86_64 64 64
*/
static inline u32 ceph_ino_to_ino32(__u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
if (!ino)
ino = 2;
return ino;
}
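/*
 * e.g. folding the hypothetical vino 0x0000000100000001: the low 32
 * bits (0x1) XOR the high 32 bits (0x1) give 0, which is remapped to
 * 2 so that the invalid ino 0 is never returned.
 */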
/*
* kernel i_ino value
*/
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
#if BITS_PER_LONG == 32
return ceph_ino_to_ino32(vino.ino);
#else
return (ino_t)vino.ino;
#endif
}
/*
* user-visible ino (stat, filldir)
*/
#if BITS_PER_LONG == 32
static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
{
return ino;
}
#else
static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
{
if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
ino = ceph_ino_to_ino32(ino);
return ino;
}
#endif
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
struct ceph_inode_info *ci = ceph_inode(inode);
return ci->i_vino.ino == pvino->ino &&
ci->i_vino.snap == pvino->snap;
}
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
ino_t t = ceph_vino_to_ino(vino);
return ilookup5(sb, t, ceph_ino_compare, &vino);
}
/*
* Ceph inode.
*/
#define CEPH_I_NODELAY 4 /* do not delay cap release */
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
int release_count)
{
atomic_set(&ci->i_complete_count, release_count);
}
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
atomic_inc(&ci->i_release_count);
}
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{
return atomic_read(&ci->i_complete_count) ==
atomic_read(&ci->i_release_count);
}
static inline void ceph_dir_clear_complete(struct inode *inode)
{
__ceph_dir_clear_complete(ceph_inode(inode));
}
static inline bool ceph_dir_is_complete(struct inode *inode)
{
return __ceph_dir_is_complete(ceph_inode(inode));
}
/* find a specific frag @f */
extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
u32 f);
/*
* choose fragment for value @v. copy frag content to pfrag, if leaf
* exists
*/
extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
return ((loff_t)frag << 32) | (loff_t)off;
}
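/*
 * e.g. ceph_make_fpos(0x2, 5) == 0x0000000200000005: the frag in the
 * high 32 bits, the within-frag offset in the low 32.
 */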
/*
* caps helpers
*/
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
{
return !RB_EMPTY_ROOT(&ci->i_caps);
}
extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
struct ceph_cap *cap);
static inline int ceph_caps_issued(struct ceph_inode_info *ci)
{
int issued;
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
return issued;
}
static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
int touch)
{
int r;
spin_lock(&ci->i_ceph_lock);
r = __ceph_caps_issued_mask(ci, mask, touch);
spin_unlock(&ci->i_ceph_lock);
return r;
}
static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
return ci->i_dirty_caps | ci->i_flushing_caps;
}
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
struct ceph_cap *ocap, int mask);
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
/*
* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
*/
static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
if (w & CEPH_CAP_FILE_BUFFER)
w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
return w;
}
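/*
 * e.g. (hypothetical state): a file open for buffered write wants
 * CEPH_CAP_FILE_WR and holds CEPH_CAP_FILE_BUFFER refs for its dirty
 * pages, so the helper above also sets CEPH_CAP_FILE_EXCL.
 */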
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
/*
* we keep buffered readdir results attached to file->private_data
*/
#define CEPH_F_SYNC 1
#define CEPH_F_ATEND 2
struct ceph_file_info {
short fmode; /* initialized on open */
short flags; /* CEPH_F_* */
/* readdir: position within the dir */
u32 frag;
struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
/* used for -o dirstat read() on a directory */
char *dir_info;
int dir_info_len;
};
/*
* A "snap realm" describes a subset of the file hierarchy sharing
* the same set of snapshots that apply to it. The realms themselves
* are organized into a hierarchy, such that children inherit (some of)
* the snapshots of their parents.
*
* All inodes within the realm that have capabilities are linked into a
* per-realm list.
*/
struct ceph_snap_realm {
u64 ino;
atomic_t nref;
struct rb_node node;
u64 created, seq;
u64 parent_ino;
u64 parent_since; /* snapid when our current parent became so */
u64 *prior_parent_snaps; /* snaps inherited from any parents we */
u32 num_prior_parent_snaps; /* had prior to parent_since */
u64 *snaps; /* snaps specific to this realm */
u32 num_snaps;
struct ceph_snap_realm *parent;
struct list_head children; /* list of child realms */
struct list_head child_item;
struct list_head empty_item; /* if we hold no refs (nref == 0) */
struct list_head dirty_item; /* if realm needs new context */
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
struct list_head inodes_with_caps;
spinlock_t inodes_with_caps_lock;
};
static inline int default_congestion_kb(void)
{
int congestion_kb;
/*
* Copied from NFS
*
* congestion size, scale with available memory.
*
* 64MB: 8192k
* 128MB: 11585k
* 256MB: 16384k
* 512MB: 23170k
* 1GB: 32768k
* 2GB: 46340k
* 4GB: 65536k
* 8GB: 92681k
* 16GB: 131072k
*
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
return congestion_kb;
}
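/*
 * Worked example (assuming 4 KiB pages, PAGE_SHIFT == 12): a 64 MB
 * machine has totalram_pages == 16384, int_sqrt(16384) == 128, and
 * (16 * 128) << (12 - 10) == 8192k, matching the first row of the
 * table above.
 */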
/* snap.c */
extern struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
						      u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
void *p, void *e, bool deletion);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
* sync write (that may/may not still update size, mtime, etc.).
*/
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{
return !list_empty(&ci->i_cap_snaps) &&
list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
ci_item)->writing;
}
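/*
 * i_cap_snaps is ordered oldest to newest (new cap_snaps are appended
 * at the tail), so testing ->writing on the tail entry
 * (i_cap_snaps.prev) is enough to detect a still-pending sync write.
 */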
/* inode.c */
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
extern struct inode *ceph_get_snapdir(struct inode *parent);
extern int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size);
extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec *ctime,
struct timespec *mtime, struct timespec *atime);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req,
struct ceph_mds_session *session);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern int ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
extern void ceph_queue_writeback(struct inode *inode);
extern int ceph_do_getattr(struct inode *inode, int mask, bool force);
extern int ceph_permission(struct inode *inode, int mask);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
/* xattr.c */
extern int ceph_setxattr(struct dentry *, const char *, const void *,
size_t, int);
extern int __ceph_setxattr(struct dentry *, const char *, const void *,
			   size_t, int);
extern ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern int __ceph_removexattr(struct dentry *, const char *);
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern int ceph_removexattr(struct dentry *, const char *);
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
extern const struct xattr_handler *ceph_xattr_handlers[];
/* acl.c */
struct ceph_acls_info {
void *default_acl;
void *acl;
struct ceph_pagelist *pagelist;
};
#ifdef CONFIG_CEPH_FS_POSIX_ACL
struct posix_acl *ceph_get_acl(struct inode *, int);
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acls_info *info);
void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
void ceph_release_acls_info(struct ceph_acls_info *info);
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
forget_all_cached_acls(inode);
}
#else
#define ceph_get_acl NULL
#define ceph_set_acl NULL
static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acls_info *info)
{
return 0;
}
static inline void ceph_init_inode_acls(struct inode *inode,
struct ceph_acls_info *info)
{
}
static inline void ceph_release_acls_info(struct ceph_acls_info *info)
{
}
static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
{
return 0;
}
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
}
#endif
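/*
 * With CONFIG_CEPH_FS_POSIX_ACL disabled, ceph_get_acl/ceph_set_acl
 * resolve to NULL wherever they are wired up, so generic VFS code skips
 * ACL handling and the remaining stubs compile down to no-ops.
 */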
/* caps.c */
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
int fmode, unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
u64 cap_id, u32 migrate_seq, u32 issue_seq);
extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
extern int ceph_get_cap_mds(struct inode *inode);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession,
int again);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
int mds, int drop, int unless);
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
int *got, loff_t endoff);
/* for counting open files by mode */
static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
{
ci->i_nr_by_mode[mode]++;
}
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
int *opened);
extern int ceph_release(struct inode *inode, struct file *filp);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct inode_operations ceph_dir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
extern int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
/*
* our d_ops vary depending on whether the inode is live,
* snapshotted (read-only), or a virtual ".snap" directory.
*/
int ceph_init_dentry(struct dentry *dentry);
/* ioctl.c */
extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* export.c */
extern const struct export_operations ceph_export_ops;
/* locks.c */
extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
extern int ceph_encode_locks_to_buffer(struct inode *inode,
struct ceph_filelock *flocks,
int num_fcntl_locks,
int num_flock_locks);
extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks);
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
#endif /* _FS_CEPH_SUPER_H */

1105
fs/ceph/xattr.c Normal file

File diff suppressed because it is too large