Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

75
fs/ext4/Kconfig Normal file
View file

@ -0,0 +1,75 @@
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select JBD2
select CRC16
select CRYPTO
select CRYPTO_CRC32C
help
This is the next generation of the ext3 filesystem.
Unlike the change from ext2 filesystem to ext3 filesystem,
the on-disk format of ext4 is not forwards compatible with
ext3; it is based on extent maps and it supports 48-bit
physical block numbers. The ext4 filesystem also supports delayed
allocation, persistent preallocation, high resolution time stamps,
and a number of other features to improve performance and speed
up fsck time. For more information, please see the web pages at
http://ext4.wiki.kernel.org.
The ext4 filesystem will support mounting an ext3
filesystem; while there will be some performance gains from
the delayed allocation and inode table readahead, the best
performance gains will require enabling ext4 features in the
filesystem, or formatting a new filesystem as an ext4
filesystem initially.
To compile this file system support as a module, choose M here. The
module will be called ext4.
If unsure, say N.
config EXT4_USE_FOR_EXT23
bool "Use ext4 for ext2/ext3 file systems"
depends on EXT4_FS
depends on EXT3_FS=n || EXT2_FS=n
default y
help
Allow the ext4 file system driver code to be used for ext2 or
ext3 file system mounts. This allows users to reduce their
compiled kernel size by using one file system driver for
ext2, ext3, and ext4 file systems.
config EXT4_FS_POSIX_ACL
bool "Ext4 POSIX Access Control Lists"
depends on EXT4_FS
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
groups beyond the owner/group/world scheme.
To learn more about Access Control Lists, visit the POSIX ACLs for
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
config EXT4_FS_SECURITY
bool "Ext4 Security Labels"
depends on EXT4_FS
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
enables an extended attribute handler for file security
labels in the ext4 filesystem.
If you are not using a security module that requires using
extended attributes for file security labels, say N.
config EXT4_DEBUG
bool "EXT4 debugging support"
depends on EXT4_FS
help
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
with a command such as:
echo 1 > /sys/module/ext4/parameters/mballoc_debug

18
fs/ext4/Makefile Normal file
View file

@ -0,0 +1,18 @@
#
# Makefile for the linux ext4-filesystem routines.
#
ifeq ($(SEC_BUILD_CONF_QUICK_DMVERITY),true)
EXTRA_CFLAGS += -DVERIFY_META_ONLY=true
endif
obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
xattr_trusted.o inline.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o

288
fs/ext4/acl.c Normal file
View file

@ -0,0 +1,288 @@
/*
* linux/fs/ext4/acl.c
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"
/*
* Convert from filesystem to in-memory representation.
*/
static struct posix_acl *
ext4_acl_from_disk(const void *value, size_t size)
{
const char *end = (char *)value + size;
int n, count;
struct posix_acl *acl;
if (!value)
return NULL;
if (size < sizeof(ext4_acl_header))
return ERR_PTR(-EINVAL);
if (((ext4_acl_header *)value)->a_version !=
cpu_to_le32(EXT4_ACL_VERSION))
return ERR_PTR(-EINVAL);
value = (char *)value + sizeof(ext4_acl_header);
count = ext4_acl_count(size);
if (count < 0)
return ERR_PTR(-EINVAL);
if (count == 0)
return NULL;
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
for (n = 0; n < count; n++) {
ext4_acl_entry *entry =
(ext4_acl_entry *)value;
if ((char *)value + sizeof(ext4_acl_entry_short) > end)
goto fail;
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
switch (acl->a_entries[n].e_tag) {
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
value = (char *)value +
sizeof(ext4_acl_entry_short);
break;
case ACL_USER:
value = (char *)value + sizeof(ext4_acl_entry);
if ((char *)value > end)
goto fail;
acl->a_entries[n].e_uid =
make_kuid(&init_user_ns,
le32_to_cpu(entry->e_id));
break;
case ACL_GROUP:
value = (char *)value + sizeof(ext4_acl_entry);
if ((char *)value > end)
goto fail;
acl->a_entries[n].e_gid =
make_kgid(&init_user_ns,
le32_to_cpu(entry->e_id));
break;
default:
goto fail;
}
}
if (value != end)
goto fail;
return acl;
fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
/*
* Convert from in-memory to filesystem representation.
*/
static void *
ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
{
ext4_acl_header *ext_acl;
char *e;
size_t n;
*size = ext4_acl_size(acl->a_count);
ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
sizeof(ext4_acl_entry), GFP_NOFS);
if (!ext_acl)
return ERR_PTR(-ENOMEM);
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header);
for (n = 0; n < acl->a_count; n++) {
const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext4_acl_entry *entry = (ext4_acl_entry *)e;
entry->e_tag = cpu_to_le16(acl_e->e_tag);
entry->e_perm = cpu_to_le16(acl_e->e_perm);
switch (acl_e->e_tag) {
case ACL_USER:
entry->e_id = cpu_to_le32(
from_kuid(&init_user_ns, acl_e->e_uid));
e += sizeof(ext4_acl_entry);
break;
case ACL_GROUP:
entry->e_id = cpu_to_le32(
from_kgid(&init_user_ns, acl_e->e_gid));
e += sizeof(ext4_acl_entry);
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
e += sizeof(ext4_acl_entry_short);
break;
default:
goto fail;
}
}
return (char *)ext_acl;
fail:
kfree(ext_acl);
return ERR_PTR(-EINVAL);
}
/*
* Inode operation get_posix_acl().
*
* inode->i_mutex: don't care
*/
struct posix_acl *
ext4_get_acl(struct inode *inode, int type)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
default:
BUG();
}
retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
if (retval > 0) {
value = kmalloc(retval, GFP_NOFS);
if (!value)
return ERR_PTR(-ENOMEM);
retval = ext4_xattr_get(inode, name_index, "", value, retval);
}
if (retval > 0)
acl = ext4_acl_from_disk(value, retval);
else if (retval == -ENODATA || retval == -ENOSYS)
acl = NULL;
else
acl = ERR_PTR(retval);
kfree(value);
if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
return acl;
}
/*
* Set the access or default ACL of an inode.
*
* inode->i_mutex: down unless called from ext4_new_inode
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
void *value = NULL;
size_t size = 0;
int error;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
else {
inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
if (error == 0)
acl = NULL;
}
}
break;
case ACL_TYPE_DEFAULT:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
default:
return -EINVAL;
}
if (acl) {
value = ext4_acl_to_disk(acl, &size);
if (IS_ERR(value))
return (int)PTR_ERR(value);
}
error = ext4_xattr_set_handle(handle, inode, name_index, "",
value, size, 0);
kfree(value);
if (!error)
set_cached_acl(inode, type, acl);
return error;
}
int
ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
handle_t *handle;
int error, retries = 0;
retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle))
return PTR_ERR(handle);
error = __ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
return error;
}
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
* dir->i_mutex: down
* inode->i_mutex: up (access to inode is still exclusive)
*/
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
struct posix_acl *default_acl, *acl;
int error;
error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (error)
return error;
if (default_acl) {
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
default_acl);
posix_acl_release(default_acl);
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
acl);
posix_acl_release(acl);
}
return error;
}

72
fs/ext4/acl.h Normal file
View file

@ -0,0 +1,72 @@
/*
File: fs/ext4/acl.h
(C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
#include <linux/posix_acl_xattr.h>
#define EXT4_ACL_VERSION 0x0001
typedef struct {
__le16 e_tag;
__le16 e_perm;
__le32 e_id;
} ext4_acl_entry;
typedef struct {
__le16 e_tag;
__le16 e_perm;
} ext4_acl_entry_short;
typedef struct {
__le32 a_version;
} ext4_acl_header;
static inline size_t ext4_acl_size(int count)
{
if (count <= 4) {
return sizeof(ext4_acl_header) +
count * sizeof(ext4_acl_entry_short);
} else {
return sizeof(ext4_acl_header) +
4 * sizeof(ext4_acl_entry_short) +
(count - 4) * sizeof(ext4_acl_entry);
}
}
static inline int ext4_acl_count(size_t size)
{
ssize_t s;
size -= sizeof(ext4_acl_header);
s = size - 4 * sizeof(ext4_acl_entry_short);
if (s < 0) {
if (size % sizeof(ext4_acl_entry_short))
return -1;
return size / sizeof(ext4_acl_entry_short);
} else {
if (s % sizeof(ext4_acl_entry))
return -1;
return s / sizeof(ext4_acl_entry) + 4;
}
}
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type);
int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
#define ext4_get_acl NULL
#define ext4_set_acl NULL
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */

883
fs/ext4/balloc.c Normal file
View file

@ -0,0 +1,883 @@
/*
* linux/fs/ext4/balloc.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "mballoc.h"
#include <trace/events/ext4.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
* Calculate block group number for a given block number
*/
ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block)
{
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
group = (block -
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
return group;
}
/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
*/
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
ext4_grpblk_t offset;
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
EXT4_SB(sb)->s_cluster_bits;
if (offsetp)
*offsetp = offset;
if (blockgrpp)
*blockgrpp = blocknr;
}
/*
* Check whether the 'block' lives within the 'block_group'. Returns 1 if so
* and 0 otherwise.
*/
static inline int ext4_block_in_group(struct super_block *sb,
ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
actual_group = ext4_get_group_number(sb, block);
return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
unsigned num_clusters;
int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
ext4_fsblk_t itbl_blk;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* This is the number of clusters used by the superblock,
* block group descriptors, and reserved block group
* descriptor blocks */
num_clusters = ext4_num_base_meta_clusters(sb, block_group);
/*
* For the allocation bitmaps and inode table, we first need
* to check to see if the block is in the block group. If it
* is, then check to see if the cluster is already accounted
* for in the clusters used for the base metadata cluster, or
* if we can increment the base metadata cluster to include
* that block. Otherwise, we will have to track the cluster
* used for the allocation bitmap or inode table explicitly.
* Normally all of these blocks are contiguous, so the special
* case handling shouldn't be necessary except for *very*
* unusual file system layouts.
*/
if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
block_cluster = EXT4_B2C(sbi,
ext4_block_bitmap(sb, gdp) - start);
if (block_cluster < num_clusters)
block_cluster = -1;
else if (block_cluster == num_clusters) {
num_clusters++;
block_cluster = -1;
}
}
if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
inode_cluster = EXT4_B2C(sbi,
ext4_inode_bitmap(sb, gdp) - start);
if (inode_cluster < num_clusters)
inode_cluster = -1;
else if (inode_cluster == num_clusters) {
num_clusters++;
inode_cluster = -1;
}
}
itbl_blk = ext4_inode_table(sb, gdp);
for (i = 0; i < sbi->s_itb_per_group; i++) {
if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
c = EXT4_B2C(sbi, itbl_blk + i - start);
if ((c < num_clusters) || (c == inode_cluster) ||
(c == block_cluster) || (c == itbl_cluster))
continue;
if (c == num_clusters) {
num_clusters++;
continue;
}
num_clusters++;
itbl_cluster = c;
}
}
if (block_cluster != -1)
num_clusters++;
if (inode_cluster != -1)
num_clusters++;
return num_clusters;
}
static unsigned int num_clusters_in_group(struct super_block *sb,
ext4_group_t block_group)
{
unsigned int blocks;
if (block_group == ext4_get_groups_count(sb) - 1) {
/*
* Even though mke2fs always initializes the first and
* last group, just in case some other tool was used,
* we need to make sure we calculate the right free
* blocks.
*/
blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
ext4_group_first_block_no(sb, block_group);
} else
blocks = EXT4_BLOCKS_PER_GROUP(sb);
return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
}
/* Initializes an uninitialized block bitmap */
static int ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
unsigned int bit, bit_max;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
int count;
count = ext4_free_inodes_count(sb, gdp);
percpu_counter_sub(&sbi->s_freeinodes_counter,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return -EIO;
}
memset(bh->b_data, 0, sb->s_blocksize);
bit_max = ext4_num_base_meta_clusters(sb, block_group);
for (bit = 0; bit < bit_max; bit++)
ext4_set_bit(bit, bh->b_data);
start = ext4_group_first_block_no(sb, block_group);
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
flex_bg = 1;
/* Set bits for block and inode bitmaps, and inode table */
tmp = ext4_block_bitmap(sb, gdp);
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
tmp = ext4_inode_bitmap(sb, gdp);
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
tmp = ext4_inode_table(sb, gdp);
for (; tmp < ext4_inode_table(sb, gdp) +
sbi->s_itb_per_group; tmp++) {
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
}
/*
* Also if the number of blocks within the group is less than
* the blocksize * 8 ( which is the size of bitmap ), set rest
* of the block bitmap to 1
*/
ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
sb->s_blocksize * 8, bh->b_data);
ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
return 0;
}
/* Return the number of free blocks in a block group. It is used when
* the block bitmap is uninitialized, so we can't just count the bits
* in the bitmap. */
unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
/*
* The free blocks are managed by bitmaps. A file system contains several
* blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
* block for inodes, N blocks for the inode table and data blocks.
*
* The file system contains group descriptors which are located after the
* super block. Each descriptor contains the number of the bitmap block and
* the free blocks count in the block. The descriptors are loaded in memory
* when a file system is mounted (see ext4_fill_super).
*/
/**
* ext4_get_group_desc() -- load group descriptor from disk
* @sb: super block
* @block_group: given block group
* @bh: pointer to the buffer head to store the block
* group descriptor
*/
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head **bh)
{
unsigned int group_desc;
unsigned int offset;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
return NULL;
}
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
return NULL;
}
desc = (struct ext4_group_desc *)(
(__u8 *)sbi->s_group_desc[group_desc]->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
*bh = sbi->s_group_desc[group_desc];
return desc;
}
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
*/
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
ext4_group_t block_group,
struct buffer_head *bh)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
/* with FLEX_BG, the inode/block bitmaps and itable
* blocks may not be in the group at all
* so the bitmap validation will be skipped for those groups
* or it has to also read the block group where the bitmaps
* are located to verify they are set.
*/
return 0;
}
group_first_block = ext4_group_first_block_no(sb, block_group);
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode table block number is set */
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
EXT4_B2C(sbi, offset));
if (next_zero_bit <
EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
/* bad bitmap for inode tables */
return blk;
return 0;
}
static void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return;
ext4_lock_group(sb, block_group);
blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
if (unlikely(blk != 0)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
}
/**
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return NULL;
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
if (bitmap_uptodate(bh))
goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
goto verify;
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
int err;
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
if (err)
ext4_error(sb, "Checksum bad for grp %u", block_group);
return bh;
}
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
/*
* if not uninit if bh is uptodate,
* bitmap is also uptodate
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
goto verify;
}
/*
* submit the buffer_head for reading
*/
set_buffer_new(bh);
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
if (buffer_verified(bh))
return bh;
put_bh(bh);
return NULL;
}
/* Returns 0 on success, 1 on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
struct ext4_group_desc *desc;
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return 1;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
return 1;
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
ext4_validate_block_bitmap(sb, desc, block_group, bh);
/* ...but check for error just in case errors=continue. */
return !buffer_verified(bh);
}
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct buffer_head *bh;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
if (!bh)
return NULL;
if (ext4_wait_block_bitmap(sb, block_group, bh)) {
put_bh(bh);
return NULL;
}
return bh;
}
/**
* ext4_has_free_clusters()
* @sbi: in-core super block structure.
* @nclusters: number of needed blocks
* @flags: flags from ext4_mb_new_blocks()
*
* Check if filesystem has nclusters free & available for allocation.
* On success return 1, return 0 on failure.
*/
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
* we are safe to do a plane bit shift only.
*/
rsv = (atomic64_read(&sbi->s_r_blocks_count) >> sbi->s_cluster_bits) +
resv_clusters;
if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
}
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
return 1;
}
/* No free blocks. Let's see if we can dip into reserved pool */
if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
return 0;
}
int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
if (ext4_has_free_clusters(sbi, nclusters, flags)) {
percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
return 0;
} else
return -ENOSPC;
}
/**
* ext4_should_retry_alloc()
* @sb: super block
* @retries number of attemps has been made
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
* for the current or committing transaction to complete, and then
* return TRUE.
*
* if the total number of retries exceed three times, return FALSE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}
/*
* ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
*
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: pointer to total number of clusters needed
* @errp: error code
*
* Return 1st allocated block number on success, *count stores total account
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned int flags,
unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
memset(&ar, 0, sizeof(ar));
/* Fill with neighbour allocated blocks */
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
* fail EDQUOT for metdata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
dquot_alloc_block_nofail(inode,
EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
return ret;
}
/**
* ext4_count_free_clusters() -- count filesystem free clusters
* @sb: superblock
*
* Adds up the number of free clusters from each block group.
*/
ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
unsigned int x;
struct buffer_head *bitmap_bh = NULL;
es = EXT4_SB(sb)->s_es;
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
grp = NULL;
if (EXT4_SB(sb)->s_group_info)
grp = ext4_get_group_info(sb, i);
if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
x = ext4_count_free(bitmap_bh->b_data,
EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
", computed = %llu, %llu\n",
EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
desc_count = 0;
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
grp = NULL;
if (EXT4_SB(sb)->s_group_info)
grp = ext4_get_group_info(sb, i);
if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
#endif
}
static inline int test_root(ext4_group_t a, int b)
{
while (1) {
if (a < b)
return 0;
if (a == b)
return 1;
if ((a % b) != 0)
return 0;
a = a / b;
}
}
/**
* ext4_bg_has_super - number of blocks used by the superblock in group
* @sb: superblock for filesystem
* @group: group number to check
*
* Return the number of blocks used by the superblock (primary or backup)
* in this group. Currently this will be only 0 or 1.
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
if (group == 0)
return 1;
if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
group == le32_to_cpu(es->s_backup_bgs[1]))
return 1;
return 0;
}
if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
return 1;
if (!(group & 1))
return 0;
if (test_root(group, 3) || (test_root(group, 5)) ||
test_root(group, 7))
return 1;
return 0;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
ext4_group_t group)
{
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
if (group == first || group == first + 1 || group == last)
return 1;
return 0;
}
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
if (!ext4_bg_has_super(sb, group))
return 0;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
else
return EXT4_SB(sb)->s_gdb_count;
}
/**
* ext4_bg_num_gdb - number of blocks used by the group table in group
* @sb: superblock for filesystem
* @group: group number to check
*
* Return the number of blocks used by the group descriptor table
* (primary or backup) in this group. In the future there may be a
* different number of descriptor blocks in each group.
*/
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
unsigned long first_meta_bg =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
metagroup < first_meta_bg)
return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
}
/*
* This function returns the number of file system metadata clusters at
* the beginning of a block group, including the reserved gdt blocks.
*/
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned num;
/* Check for superblock and gdt backups in this group */
num = ext4_bg_has_super(sb, block_group);
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
num += ext4_bg_num_gdb(sb, block_group);
num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
num += ext4_bg_num_gdb(sb, block_group);
}
return EXT4_NUM_B2C(sbi, num);
}
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
*
* Return the ideal location to start allocating blocks for a
* newly created inode.
*/
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
ext4_group_t block_group;
ext4_grpblk_t colour;
int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
block_group = ei->i_block_group;
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
* block groups per flexgroup, reserve the first block
* group for directories and special files. Regular
* files will start at the second block group. This
* tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
if (S_ISREG(inode->i_mode))
block_group++;
}
bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
/*
* If we are doing delayed allocation, we don't need take
* colour into account.
*/
if (test_opt(inode->i_sb, DELALLOC))
return bg_start;
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
colour = (current->pid % 16) * ((last_block - bg_start) / 16);
return bg_start + colour;
}

98
fs/ext4/bitmap.c Normal file
View file

@ -0,0 +1,98 @@
/*
* linux/fs/ext4/bitmap.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*/
#include <linux/buffer_head.h>
#include <linux/jbd2.h>
#include "ext4.h"
unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
{
return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
}
int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
} else
calculated &= 0xFFFF;
return provided == calculated;
}
void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz)
{
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
} else
calculated &= 0xFFFF;
if (provided == calculated)
return 1;
return 0;
}
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}

253
fs/ext4/block_validity.c Normal file
View file

@ -0,0 +1,253 @@
/*
* linux/fs/ext4/block_validity.c
*
* Copyright (C) 2009
* Theodore Ts'o (tytso@mit.edu)
*
* Track which blocks in the filesystem are metadata blocks that
* should never be used as data blocks by files or directories.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include "ext4.h"
struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
};
static struct kmem_cache *ext4_system_zone_cachep;
int __init ext4_init_system_zone(void)
{
ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
if (ext4_system_zone_cachep == NULL)
return -ENOMEM;
return 0;
}
void ext4_exit_system_zone(void)
{
kmem_cache_destroy(ext4_system_zone_cachep);
}
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
if ((entry1->start_blk + entry1->count) == entry2->start_blk)
return 1;
return 0;
}
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
* inodes.
*/
static int add_system_zone(struct ext4_sb_info *sbi,
ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
struct rb_node **n = &sbi->system_blks.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
while (*n) {
parent = *n;
entry = rb_entry(parent, struct ext4_system_zone, node);
if (start_blk < entry->start_blk)
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
if (start_blk + count > (entry->start_blk +
entry->count))
entry->count = (start_blk + count -
entry->start_blk);
new_node = *n;
new_entry = rb_entry(new_node, struct ext4_system_zone,
node);
break;
}
}
if (!new_entry) {
new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
GFP_KERNEL);
if (!new_entry)
return -ENOMEM;
new_entry->start_blk = start_blk;
new_entry->count = count;
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
rb_insert_color(new_node, &sbi->system_blks);
}
/* Can we merge to the left? */
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
/* Can we merge to the right? */
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
return 0;
}
static void debug_print_tree(struct ext4_sb_info *sbi)
{
struct rb_node *node;
struct ext4_system_zone *entry;
int first = 1;
printk(KERN_INFO "System zones: ");
node = rb_first(&sbi->system_blks);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk("%s%llu-%llu", first ? "" : ", ",
entry->start_blk, entry->start_blk + entry->count - 1);
first = 0;
node = rb_next(node);
}
printk("\n");
}
#ifdef VERIFY_META_ONLY
struct rb_root *ext4_system_zone_root(struct super_block *sb)
{
return &EXT4_SB(sb)->system_blks;
}
#endif
int ext4_setup_system_zone(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp;
ext4_group_t i;
int flex_size = ext4_flex_bg_size(sbi);
int ret;
if (!test_opt(sb, BLOCK_VALIDITY)) {
if (EXT4_SB(sb)->system_blks.rb_node)
ext4_release_system_zone(sb);
return 0;
}
if (EXT4_SB(sb)->system_blks.rb_node)
return 0;
for (i=0; i < ngroups; i++) {
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
return ret;
ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
if (ret)
return ret;
ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
sbi->s_itb_per_group);
if (ret)
return ret;
}
if (test_opt(sb, DEBUG))
debug_print_tree(EXT4_SB(sb));
return 0;
}
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
struct ext4_system_zone *entry, *n;
rbtree_postorder_for_each_entry_safe(entry, n,
&EXT4_SB(sb)->system_blks, node)
kmem_cache_free(ext4_system_zone_cachep, entry);
EXT4_SB(sb)->system_blks = RB_ROOT;
}
/*
* Returns 1 if the passed-in block region (start_blk,
* start_blk+count) is valid; 0 if some part of the block region
* overlaps with filesystem metadata blocks.
*/
int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
#ifndef VERIFY_META_ONLY
struct ext4_system_zone *entry;
struct rb_node *n = sbi->system_blks.rb_node;
#endif
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count < start_blk) ||
(start_blk + count > ext4_blocks_count(sbi->s_es))) {
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
return 0;
}
#ifndef VERIFY_META_ONLY
while (n) {
entry = rb_entry(n, struct ext4_system_zone, node);
if (start_blk + count - 1 < entry->start_blk)
n = n->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else {
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
return 0;
}
}
#endif
return 1;
}
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
__le32 *bref = p;
unsigned int blk;
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
es->s_last_error_block = cpu_to_le64(blk);
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EIO;
}
}
return 0;
}

611
fs/ext4/dir.c Normal file
View file

@ -0,0 +1,611 @@
/*
* linux/fs/ext4/dir.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/dir.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4 directory handling functions
*
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
*
* Hash Tree Directory indexing (c) 2001 Daniel Phillips
*
*/
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ext4.h"
#include "xattr.h"
static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
* Check if the given dir-inode refers to an htree-indexed directory
* (or a directory which could potentially get converted to use htree
* indexing).
*
* Return 1 if it is a dx dir, 0 if not
*/
static int is_dx_dir(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
return 1;
return 0;
}
/*
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
* Note: this is the opposite of what ext2 and ext3 historically returned...
*
* bh passed here can be an inode block or a dir data block, depending
* on the inode inline data flag.
*/
int __ext4_check_dir_entry(const char *function, unsigned int line,
struct inode *dir, struct file *filp,
struct ext4_dir_entry_2 *de,
struct buffer_head *bh, char *buf, int size,
unsigned int offset)
{
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(((char *) de - buf) + rlen > size))
error_msg = "directory entry across range";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
else
return 0;
/* for debugging, sangwoo2.lee */
print_bh(dir->i_sb, bh, 0, EXT4_BLOCK_SIZE(dir->i_sb));
/* for debugging */
if (filp)
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
return 1;
}
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
unsigned int offset;
int i;
struct ext4_dir_entry_2 *de;
int err;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int dir_has_error = 0;
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
return err;
}
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
ext4_clear_inode_flag(file_inode(file),
EXT4_INODE_INDEX);
}
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
int ret = ext4_read_inline_dir(file, ctx,
&has_inline_data);
if (has_inline_data)
return ret;
}
offset = ctx->pos & (sb->s_blocksize - 1);
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
}
if (!bh) {
if (!dir_has_error) {
EXT4_ERROR_FILE(file, 0,
"directory contains a "
"hole at offset %llu",
(unsigned long long) ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
if (ctx->pos > inode->i_blocks << 9)
break;
ctx->pos += sb->s_blocksize - offset;
continue;
}
/* Check the checksum */
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(inode,
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_FILE(file, 0, "directory fails checksum "
"at offset %llu",
(unsigned long long)ctx->pos);
ctx->pos += sb->s_blocksize - offset;
brelse(bh);
continue;
}
set_buffer_verified(bh);
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
if (file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
/* It's too expensive to do a full
* dirent test each time round this
* loop, but we do have to test at
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
break;
i += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
file->f_version = inode->i_version;
}
while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
if (ext4_check_dir_entry(inode, file, de, bh,
bh->b_data, bh->b_size,
offset)) {
/*
* On error, skip to the next block
*/
ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
break;
}
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
le32_to_cpu(de->inode),
get_dtype(sb, de->file_type))) {
brelse(bh);
return 0;
}
}
ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
offset = 0;
brelse(bh);
if (ctx->pos < inode->i_size) {
if (!dir_relax(inode))
return 0;
}
}
return 0;
}
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
return is_compat_task();
#else
return (BITS_PER_LONG == 32);
#endif
}
/*
* These functions convert from the major/minor hash to an f_pos
* value for dx directories
*
* Upper layer (for example NFS) should specify FMODE_32BITHASH or
* FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
* directly on both 32-bit and 64-bit nodes, under such case, neither
* FMODE_32BITHASH nor FMODE_64BITHASH is specified.
*/
static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
{
if ((filp->f_mode & FMODE_32BITHASH) ||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
return major >> 1;
else
return ((__u64)(major >> 1) << 32) | (__u64)minor;
}
static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
if ((filp->f_mode & FMODE_32BITHASH) ||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
return (pos << 1) & 0xffffffff;
else
return ((pos >> 32) << 1) & 0xffffffff;
}
static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
if ((filp->f_mode & FMODE_32BITHASH) ||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
return 0;
else
return pos & 0xffffffff;
}
/*
* Return 32- or 64-bit end-of-file for dx directories
*/
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
if ((filp->f_mode & FMODE_32BITHASH) ||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
return EXT4_HTREE_EOF_32BIT;
else
return EXT4_HTREE_EOF_64BIT;
}
/*
* ext4_dir_llseek() calls generic_file_llseek_size to handle htree
* directories, where the "offset" is in terms of the filename hash
* value instead of the byte offset.
*
* Because we may return a 64-bit hash that is well beyond offset limits,
* we need to pass the max hash as the maximum allowable offset in
* the htree directory case.
*
* For non-htree, ext4_llseek already chooses the proper max offset.
*/
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
loff_t htree_max = ext4_get_htree_eof(file);
if (likely(dx_dir))
return generic_file_llseek_size(file, offset, whence,
htree_max, htree_max);
else
return ext4_llseek(file, offset, whence);
}
/*
* This structure holds the nodes of the red-black tree used to store
* the directory entry in hash order.
*/
struct fname {
__u32 hash;
__u32 minor_hash;
struct rb_node rb_hash;
struct fname *next;
__u32 inode;
__u8 name_len;
__u8 file_type;
char name[0];
};
/*
* This functoin implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
*/
static void free_rb_tree_fname(struct rb_root *root)
{
struct fname *fname, *next;
rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
struct fname *old = fname;
fname = fname->next;
kfree(old);
}
*root = RB_ROOT;
}
static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
p->curr_hash = pos2maj_hash(filp, pos);
p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
void ext4_htree_free_dir_info(struct dir_private_info *p)
{
free_rb_tree_fname(&p->root);
kfree(p);
}
/*
* Given a directory entry, enter it into the fname rb tree.
*/
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent)
{
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
struct dir_private_info *info;
int len;
info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
len = sizeof(struct fname) + dirent->name_len + 1;
new_fn = kzalloc(len, GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
new_fn->minor_hash = minor_hash;
new_fn->inode = le32_to_cpu(dirent->inode);
new_fn->name_len = dirent->name_len;
new_fn->file_type = dirent->file_type;
memcpy(new_fn->name, dirent->name, dirent->name_len);
new_fn->name[dirent->name_len] = 0;
while (*p) {
parent = *p;
fname = rb_entry(parent, struct fname, rb_hash);
/*
* If the hash and minor hash match up, then we put
* them on a linked list. This rarely happens...
*/
if ((new_fn->hash == fname->hash) &&
(new_fn->minor_hash == fname->minor_hash)) {
new_fn->next = fname->next;
fname->next = new_fn;
return 0;
}
if (new_fn->hash < fname->hash)
p = &(*p)->rb_left;
else if (new_fn->hash > fname->hash)
p = &(*p)->rb_right;
else if (new_fn->minor_hash < fname->minor_hash)
p = &(*p)->rb_left;
else /* if (new_fn->minor_hash > fname->minor_hash) */
p = &(*p)->rb_right;
}
rb_link_node(&new_fn->rb_hash, parent, p);
rb_insert_color(&new_fn->rb_hash, &info->root);
return 0;
}
/*
* This is a helper function for ext4_dx_readdir. It calls filldir
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
static int call_filldir(struct file *file, struct dir_context *ctx,
struct fname *fname)
{
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
if (!fname) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
"called with null fname?!?", __func__, __LINE__,
inode->i_ino, current->comm);
return 0;
}
ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
if (!dir_emit(ctx, fname->name,
fname->name_len,
fname->inode,
get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
return 1;
}
fname = fname->next;
}
return 0;
}
static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
file->private_data = info;
}
if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
info->curr_hash = pos2maj_hash(file, ctx->pos);
info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
* If there are any leftover names on the hash collision
* chain, return them first.
*/
if (info->extra_fname) {
if (call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
} else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
while (1) {
/*
* Fill the rbtree if we have no more entries,
* or the inode has changed since we last read in the
* cached entries.
*/
if ((!info->curr_node) ||
(file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
file->f_version = inode->i_version;
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
}
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
if (call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
if (info->curr_node) {
fname = rb_entry(info->curr_node, struct fname,
rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
info->curr_minor_hash = 0;
}
}
finished:
info->last_pos = ctx->pos;
return 0;
}
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
return 0;
}
int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
int buf_size)
{
struct ext4_dir_entry_2 *de;
int nlen, rlen;
unsigned int offset = 0;
char *top;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size;
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
return -EIO;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
if ((char *) de > top)
return -EIO;
return 0;
}
const struct file_operations ext4_dir_operations = {
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate = ext4_readdir,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
.release = ext4_release_dir,
};

2863
fs/ext4/ext4.h Normal file

File diff suppressed because it is too large Load diff

274
fs/ext4/ext4_extents.h Normal file
View file

@ -0,0 +1,274 @@
/*
* Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
* Written by Alex Tomas <alex@clusterfs.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public Licens
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
#ifndef _EXT4_EXTENTS
#define _EXT4_EXTENTS
#include "ext4.h"
/*
* With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
* becomes very small, so index split, in-depth growing and
* other hard changes happen much more often.
* This is for debug purposes only.
*/
#define AGGRESSIVE_TEST_
/*
* With EXTENTS_STATS defined, the number of blocks and extents
* are collected in the truncate path. They'll be shown at
* umount time.
*/
#define EXTENTS_STATS__
/*
* If CHECK_BINSEARCH is defined, then the results of the binary search
* will also be checked by linear search.
*/
#define CHECK_BINSEARCH__
/*
* If EXT_STATS is defined then stats numbers are collected.
* These number will be displayed at umount time.
*/
#define EXT_STATS_
/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
* For non-inode extent blocks, ext4_extent_tail
* follows the array.
*/
/*
* This is the extent tail on-disk structure.
* All other extent structures are 12 bytes long. It turns out that
* block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
* covers all valid ext4 block sizes. Therefore, this tail structure can be
* crammed into the end of the block without having to rebalance the tree.
*/
struct ext4_extent_tail {
__le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
};
/*
* This is the extent on-disk structure.
* It's used at the bottom of the tree.
*/
struct ext4_extent {
__le32 ee_block; /* first logical block extent covers */
__le16 ee_len; /* number of blocks covered by extent */
__le16 ee_start_hi; /* high 16 bits of physical block */
__le32 ee_start_lo; /* low 32 bits of physical block */
};
/*
* This is index on-disk structure.
* It's used at all the levels except the bottom.
*/
struct ext4_extent_idx {
__le32 ei_block; /* index covers logical blocks from 'block' */
__le32 ei_leaf_lo; /* pointer to the physical block of the next *
* level. leaf or next index could be there */
__le16 ei_leaf_hi; /* high 16 bits of physical block */
__u16 ei_unused;
};
/*
* Each block (leaves and indexes), even inode-stored has header.
*/
struct ext4_extent_header {
__le16 eh_magic; /* probably will support different formats */
__le16 eh_entries; /* number of valid entries */
__le16 eh_max; /* capacity of store in entries */
__le16 eh_depth; /* has tree real underlying blocks? */
__le32 eh_generation; /* generation of the tree */
};
#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
(sizeof(struct ext4_extent_header) + \
(sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
static inline struct ext4_extent_tail *
find_ext4_extent_tail(struct ext4_extent_header *eh)
{
return (struct ext4_extent_tail *)(((void *)eh) +
EXT4_EXTENT_TAIL_OFFSET(eh));
}
/*
* Array of ext4_ext_path contains path to some extent.
* Creation/lookup routines use it for traversal/splitting/etc.
* Truncate uses it to simulate recursive walking.
*/
struct ext4_ext_path {
ext4_fsblk_t p_block;
__u16 p_depth;
__u16 p_maxdepth;
struct ext4_extent *p_ext;
struct ext4_extent_idx *p_idx;
struct ext4_extent_header *p_hdr;
struct buffer_head *p_bh;
};
/*
* structure for external API
*/
/*
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
* particular extent is an initialized extent or an unwritten (i.e.
* preallocated).
* EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
* unwritten extent.
* If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
* unwritten one. In other words, if MSB of ee_len is set, it is an
* unwritten extent with only one special scenario when ee_len = 0x8000.
* In this case we can not have an unwritten extent of zero length and
* thus we make it as a special case of initialized extent with 0x8000 length.
* This way we get better extent-to-group alignment for initialized extents.
* Hence, the maximum number of blocks we can have in an *initialized*
* extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
*/
#define EXT_INIT_MAX_LEN (1UL << 15)
#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1)
#define EXT_FIRST_EXTENT(__hdr__) \
((struct ext4_extent *) (((char *) (__hdr__)) + \
sizeof(struct ext4_extent_header)))
#define EXT_FIRST_INDEX(__hdr__) \
((struct ext4_extent_idx *) (((char *) (__hdr__)) + \
sizeof(struct ext4_extent_header)))
#define EXT_HAS_FREE_INDEX(__path__) \
(le16_to_cpu((__path__)->p_hdr->eh_entries) \
< le16_to_cpu((__path__)->p_hdr->eh_max))
#define EXT_LAST_EXTENT(__hdr__) \
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_LAST_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_MAX_EXTENT(__hdr__) \
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
#define EXT_MAX_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
}
static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
{
return (struct ext4_extent_header *) bh->b_data;
}
static inline unsigned short ext_depth(struct inode *inode)
{
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
/* We can not have an unwritten extent of zero length! */
BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
}
static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
/* Extent with ee_len of 0x8000 is treated as an initialized extent */
return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
}
static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
{
return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ?
le16_to_cpu(ext->ee_len) :
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
{
ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
}
/*
* ext4_ext_pblock:
* combine low and high parts of physical block number into ext4_fsblk_t
*/
static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
{
ext4_fsblk_t block;
block = le32_to_cpu(ex->ee_start_lo);
block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
return block;
}
/*
* ext4_idx_pblock:
* combine low and high parts of a leaf physical block number into ext4_fsblk_t
*/
static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
{
ext4_fsblk_t block;
block = le32_to_cpu(ix->ei_leaf_lo);
block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
return block;
}
/*
* ext4_ext_store_pblock:
* stores a large physical block number into an extent struct,
* breaking it into parts
*/
static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
ext4_fsblk_t pb)
{
ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
0xffff);
}
/*
* ext4_idx_store_pblock:
* stores a large physical block number into an index struct,
* breaking it into parts
*/
static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
ext4_fsblk_t pb)
{
ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
0xffff);
}
#define ext4_ext_dirty(handle, inode, path) \
__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
struct inode *inode, struct ext4_ext_path *path);
#endif /* _EXT4_EXTENTS */

321
fs/ext4/ext4_jbd2.c Normal file
View file

@ -0,0 +1,321 @@
/*
* Interface between ext4 and JBD
*/
#include "ext4_jbd2.h"
#include <trace/events/ext4.h>
/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
handle_t *handle = current->journal_info;
unsigned long ref_cnt = (unsigned long)handle;
BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
ref_cnt++;
handle = (handle_t *)ref_cnt;
current->journal_info = handle;
return handle;
}
/* Decrement the non-pointer handle value */
static void ext4_put_nojournal(handle_t *handle)
{
unsigned long ref_cnt = (unsigned long)handle;
BUG_ON(ref_cnt == 0);
ref_cnt--;
handle = (handle_t *)ref_cnt;
current->journal_info = handle;
}
/*
* Wrappers for jbd2_journal_start/end.
*/
static int ext4_journal_check_start(struct super_block *sb)
{
journal_t *journal;
might_sleep();
if (sb->s_flags & MS_RDONLY && !ext4_journal_current_handle())
return -EROFS;
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
/*
* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
ext4_abort(sb, "Detected aborted journal");
return -EROFS;
}
return 0;
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks)
{
journal_t *journal;
int err;
trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
if (!journal)
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
struct super_block *sb;
int err;
int rc;
if (!ext4_handle_valid(handle)) {
ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
err = handle->h_err;
rc = jbd2_journal_stop(handle);
if (!err)
err = rc;
if (err)
__ext4_std_error(sb, where, line, err);
return err;
}
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
int type)
{
struct super_block *sb;
int err;
if (!ext4_handle_valid(handle))
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
_RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
return ERR_PTR(err);
}
err = jbd2_journal_start_reserved(handle, type, line);
if (err < 0)
return ERR_PTR(err);
return handle;
}
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
handle_t *handle, int err)
{
char nbuf[16];
const char *errstr = ext4_decode_error(NULL, err, nbuf);
BUG_ON(!ext4_handle_valid(handle));
if (bh)
BUFFER_TRACE(bh, "abort");
if (!handle->h_err)
handle->h_err = err;
if (is_handle_aborted(handle))
return;
printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
caller, line, errstr, err_fn);
jbd2_journal_abort_handle(handle);
}
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;
might_sleep();
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
}
return err;
}
/*
* The ext4 forget function must perform a revoke if we are freeing data
* which has been journaled. Metadata (eg. indirect blocks) must be
* revoked in all cases.
*
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
*
* If the handle isn't valid we're not journaling, but we still need to
* call into ext4_journal_revoke() to put the buffer head.
*/
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr)
{
int err;
might_sleep();
trace_ext4_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
"data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
/* In the no journal case, we can just do a bforget and return */
if (!ext4_handle_valid(handle)) {
bforget(bh);
return 0;
}
/* Never use the revoke function if we are doing full data
* journaling: there is no need to, and a V1 superblock won't
* support it. Otherwise, only skip the revoke on un-journaled
* data blocks. */
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(!is_metadata && !ext4_should_journal_data(inode))) {
if (bh) {
BUFFER_TRACE(bh, "call jbd2_journal_forget");
err = jbd2_journal_forget(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
return err;
}
return 0;
}
/*
* data!=journal && (is_metadata || should_journal_data(inode))
*/
BUFFER_TRACE(bh, "call jbd2_journal_revoke");
err = jbd2_journal_revoke(handle, blocknr, bh);
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
__ext4_abort(inode->i_sb, where, line,
"error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
return err;
}
int __ext4_journal_get_create_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_create_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
}
return err;
}
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
int err = 0;
might_sleep();
set_buffer_meta(bh);
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
/* Errors can only happen due to aborted journal or a nasty bug */
if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
if (inode == NULL) {
pr_err("EXT4: jbd2_journal_dirty_metadata "
"failed: handle type %u started at "
"line %u, credits %u/%u, errcode %d",
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
handle->h_buffer_credits, err);
return err;
}
ext4_error_inode(inode, where, line,
bh->b_blocknr,
"journal_dirty_metadata failed: "
"handle type %u started at line %u, "
"credits %u/%u, errcode %d",
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
handle->h_buffer_credits, err);
}
} else {
if (inode)
mark_buffer_dirty_inode(bh, inode);
else
mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
struct ext4_super_block *es;
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_block =
cpu_to_le64(bh->b_blocknr);
ext4_error_inode(inode, where, line,
bh->b_blocknr,
"IO error syncing itable block");
err = -EIO;
}
}
}
return err;
}
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb)
{
struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
int err = 0;
ext4_superblock_csum_set(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
} else
mark_buffer_dirty(bh);
return err;
}

450
fs/ext4/ext4_jbd2.h Normal file
View file

@ -0,0 +1,450 @@
/*
* ext4_jbd2.h
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1998--1999 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Ext4-specific journaling extensions.
*/
#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H
#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"
#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
/* Define the number of blocks we need to account to a transaction to
* modify one block of data.
*
* We may have to touch one inode, one bitmap buffer, up to three
* indirection blocks, the group and superblock summaries, and the data
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
* 5 levels of tree, data block (for each of these we need bitmap + group
* summaries), root which is stored in the inode, sb
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
* and the superblock, which are already accounted for. */
#define EXT4_XATTR_TRANS_BLOCKS 6U
/* Define the minimum size for a transaction which modifies data. This
* needs to take into account the fact that we may end up modifying two
* quota files too (one for the group, one for the user quota). The
* superblock only gets updated once, of course, so don't bother
* counting that again for the quota updates. */
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/*
* Define the number of metadata blocks we need to account to modify data.
*
* This include super block, inode block, quota blocks and xattr blocks
*/
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
* start off at the maximum transaction size and grow the transaction
* optimistically as we go. */
#define EXT4_MAX_TRANS_DATA 64U
/* We break up a large truncate or write transaction once the handle's
* buffer credits gets this low, we need either to extend the
* transaction or to start a new one. Reserve enough space here for
* inode, bitmap, superblock, group and indirection updates for at least
* one block, plus two quota updates. Quota allocations are not
* needed. */
#define EXT4_RESERVE_TRANS_BLOCKS 12U
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
static inline int ext4_jbd2_credits_xattr(struct inode *inode)
{
int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
/*
* In case of inline data, we may push out the data to a block,
* so we need to reserve credits for this eventuality
*/
if (ext4_has_inline_data(inode))
credits += ext4_writepage_trans_blocks(inode) + 1;
return credits;
}
/*
* Ext4 handle operation types -- for logging purposes
*/
#define EXT4_HT_MISC 0
#define EXT4_HT_INODE 1
#define EXT4_HT_WRITE_PAGE 2
#define EXT4_HT_MAP_BLOCKS 3
#define EXT4_HT_DIR 4
#define EXT4_HT_TRUNCATE 5
#define EXT4_HT_QUOTA 6
#define EXT4_HT_RESIZE 7
#define EXT4_HT_MIGRATE 8
#define EXT4_HT_MOVE_EXTENTS 9
#define EXT4_HT_XATTR 10
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
*
* This struct is a 'seed' structure for a using with your own callback
* structs. If you are using callbacks you must allocate one of these
* or another struct of your own definition which has this struct
* as it's first element and pass it to ext4_journal_callback_add().
*/
struct ext4_journal_cb_entry {
/* list information for other callbacks attached to the same handle */
struct list_head jce_list;
/* Function to call with this callback structure */
void (*jce_func)(struct super_block *sb,
struct ext4_journal_cb_entry *jce, int error);
/* user data goes here */
};
/**
* ext4_journal_callback_add: add a function to call after transaction commit
* @handle: active journal transaction handle to register callback on
* @func: callback function to call after the transaction has committed:
* @sb: superblock of current filesystem for transaction
* @jce: returned journal callback data
* @rc: journal state at commit (0 = transaction committed properly)
* @jce: journal callback data (internal and function private data struct)
*
* The registered function will be called in the context of the journal thread
* after the transaction for which the handle was created has completed.
*
* No locks are held when the callback function is called, so it is safe to
* call blocking functions from within the callback, but the callback should
* not block or run for too long, or the filesystem will be blocked waiting for
* the next transaction to commit. No journaling functions can be used, or
* there is a risk of deadlock.
*
* There is no guaranteed calling order of multiple registered callbacks on
* the same transaction.
*/
static inline void ext4_journal_callback_add(handle_t *handle,
void (*func)(struct super_block *sb,
struct ext4_journal_cb_entry *jce,
int rc),
struct ext4_journal_cb_entry *jce)
{
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
/* Add the jce to transaction's private list */
jce->jce_func = func;
spin_lock(&sbi->s_md_lock);
list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
spin_unlock(&sbi->s_md_lock);
}
/**
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
* Return true if object was successfully removed
*/
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
return deleted;
}
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc);
/*
* On success, We end up with an outstanding reference count against
* iloc->bh. This _must_ be cleaned up later.
*/
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD.
*/
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct buffer_head *bh);
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb);
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
(bh), (block_nr))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
#define ext4_handle_dirty_super(handle, sb) \
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
/* Note: Do not use this for NULL handles. This is only to determine if
* a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
static inline void ext4_handle_sync(handle_t *handle)
{
if (ext4_handle_valid(handle))
handle->h_sync = 1;
}
static inline int ext4_handle_is_aborted(handle_t *handle)
{
if (ext4_handle_valid(handle))
return is_handle_aborted(handle);
return 0;
}
static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
{
if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
return 0;
return 1;
}
#define ext4_journal_start_sb(sb, type, nblocks) \
__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start(inode, type, nblocks) \
__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
int blocks, int rsv_blocks)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
#define ext4_journal_start_reserved(handle, type) \
__ext4_journal_start_reserved((handle), __LINE__, (type))
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
int type);
static inline void ext4_journal_free_reserved(handle_t *handle)
{
if (ext4_handle_valid(handle))
jbd2_journal_free_reserved(handle);
}
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
}
static inline int ext4_journal_extend(handle_t *handle, int nblocks)
{
if (ext4_handle_valid(handle))
return jbd2_journal_extend(handle, nblocks);
return 0;
}
static inline int ext4_journal_restart(handle_t *handle, int nblocks)
{
if (ext4_handle_valid(handle))
return jbd2_journal_restart(handle, nblocks);
return 0;
}
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
return jbd2_journal_blocks_per_page(inode);
return 0;
}
static inline int ext4_journal_force_commit(journal_t *journal)
{
if (journal)
return jbd2_journal_force_commit(journal);
return 0;
}
static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
if (ext4_handle_valid(handle))
return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
return 0;
}
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
struct inode *inode,
int datasync)
{
struct ext4_inode_info *ei = EXT4_I(inode);
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
}
/* super.c */
int ext4_force_commit(struct super_block *sb);
/*
* Ext4 inode journal modes
*/
#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
static inline int ext4_inode_journal_mode(struct inode *inode)
{
if (EXT4_JOURNAL(inode) == NULL)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
else
BUG();
}
static inline int ext4_should_journal_data(struct inode *inode)
{
return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
}
static inline int ext4_should_order_data(struct inode *inode)
{
return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
}
static inline int ext4_should_writeback_data(struct inode *inode)
{
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
* i_mutex for direct I/O reads. This only works for extent-based
* files, and it doesn't work if data journaling is enabled, since the
* dioread_nolock code uses b_private to pass information back to the
* I/O completion handler, and this conflicts with the jbd's use of
* b_private.
*/
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return 0;
if (ext4_should_journal_data(inode))
return 0;
return 1;
}
#endif /* _EXT4_JBD2_H */

5742
fs/ext4/extents.c Normal file

File diff suppressed because it is too large Load diff

1299
fs/ext4/extents_status.c Normal file

File diff suppressed because it is too large Load diff

157
fs/ext4/extents_status.h Normal file
View file

@ -0,0 +1,157 @@
/*
* fs/ext4/extents_status.h
*
* Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
* Modified by
* Allison Henderson <achender@linux.vnet.ibm.com>
* Zheng Liu <wenqing.lz@taobao.com>
*
*/
#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H
/*
* Turn on ES_DEBUG__ to get lots of info about extent status operations.
*/
#ifdef ES_DEBUG__
#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
/*
* With ES_AGGRESSIVE_TEST defined, the result of es caching will be
* checked with old map_block's result.
*/
#define ES_AGGRESSIVE_TEST__
/*
* These flags live in the high bits of extent_status.es_pblk
*/
#define ES_SHIFT 60
#define EXTENT_STATUS_WRITTEN (1 << 3)
#define EXTENT_STATUS_UNWRITTEN (1 << 2)
#define EXTENT_STATUS_DELAYED (1 << 1)
#define EXTENT_STATUS_HOLE (1 << 0)
#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
#define ES_WRITTEN (1ULL << 63)
#define ES_UNWRITTEN (1ULL << 62)
#define ES_DELAYED (1ULL << 61)
#define ES_HOLE (1ULL << 60)
#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
ES_DELAYED | ES_HOLE)
struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
struct rb_node rb_node;
ext4_lblk_t es_lblk; /* first logical block extent covers */
ext4_lblk_t es_len; /* length of extent in block */
ext4_fsblk_t es_pblk; /* first physical block */
};
struct ext4_es_tree {
struct rb_root root;
struct extent_status *cache_es; /* recently accessed extent */
};
struct ext4_es_stats {
unsigned long es_stats_last_sorted;
unsigned long es_stats_shrunk;
unsigned long es_stats_cache_hits;
unsigned long es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
struct percpu_counter es_stats_lru_cnt;
};
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
static inline int ext4_es_is_written(struct extent_status *es)
{
return (es->es_pblk & ES_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
return (es->es_pblk & ES_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
return (es->es_pblk & ES_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
return (es->es_pblk & ES_HOLE) != 0;
}
static inline unsigned int ext4_es_status(struct extent_status *es)
{
return es->es_pblk >> ES_SHIFT;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
return es->es_pblk & ~ES_MASK;
}
static inline void ext4_es_store_pblock(struct extent_status *es,
ext4_fsblk_t pb)
{
ext4_fsblk_t block;
block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
es->es_pblk = block;
}
static inline void ext4_es_store_status(struct extent_status *es,
unsigned int status)
{
es->es_pblk = (((ext4_fsblk_t)
(status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
(es->es_pblk & ~ES_MASK));
}
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
es->es_pblk = (((ext4_fsblk_t)
(status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
(pb & ~ES_MASK));
}
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */

614
fs/ext4/file.c Normal file
View file

@ -0,0 +1,614 @@
/*
* linux/fs/ext4/file.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/file.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4 fs regular file handling primitives
*
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
* gets called only when /all/ the files are closed.
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
return 0;
}
static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
* are converted to written only after the IO is complete. Until they are
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
* it needs to zero out portions of the start and/or end block. If 2 AIO
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
if (pos >= i_size_read(inode))
return 0;
if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
int o_direct = file->f_flags & O_DIRECT;
int overwrite = 0;
size_t length = iov_iter_count(from);
ssize_t ret;
loff_t pos = iocb->ki_pos;
/*
* Unaligned direct AIO must be serialized; see comment above
* In the case of O_APPEND, assume that we must always serialize
*/
if (o_direct &&
ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
(file->f_flags & O_APPEND ||
ext4_unaligned_aio(inode, from, pos))) {
aio_mutex = ext4_aio_mutex(inode);
mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
}
mutex_lock(&inode->i_mutex);
if (file->f_flags & O_APPEND)
iocb->ki_pos = pos = i_size_read(inode);
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if ((pos > sbi->s_bitmap_maxbytes) ||
(pos == sbi->s_bitmap_maxbytes && length > 0)) {
mutex_unlock(&inode->i_mutex);
ret = -EFBIG;
goto errout;
}
if (pos + length > sbi->s_bitmap_maxbytes)
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
iocb->private = &overwrite;
if (o_direct) {
blk_start_plug(&plug);
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
int err, len;
map.m_lblk = pos >> blkbits;
map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
- map.m_lblk;
len = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
/*
* 'err==len' means that all of blocks has
* been preallocated no matter they are
* initialized or not. For excluding
* unwritten extents, we need to check
* m_flags. There are two conditions that
* indicate for initialized extents. 1) If we
* hit extent cache, EXT4_MAP_MAPPED flag is
* returned; 2) If we do a real lookup,
* non-flags are returned. So we should check
* these two conditions.
*/
if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
overwrite = 1;
}
}
ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
if (o_direct)
blk_finish_plug(&plug);
errout:
if (aio_mutex)
mutex_unlock(aio_mutex);
return ret;
}
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
return 0;
}
static int ext4_file_open(struct inode * inode, struct file * filp)
{
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
!(sb->s_flags & MS_RDONLY))) {
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
* when trying to sort through large numbers of block
* devices or filesystem images.
*/
memset(buf, 0, sizeof(buf));
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
if (!IS_ERR(cp)) {
handle_t *handle;
int err;
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err) {
ext4_journal_stop(handle);
return err;
}
strlcpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
ext4_handle_dirty_super(handle, sb);
ext4_journal_stop(handle);
}
}
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
if (filp->f_mode & FMODE_WRITE) {
int ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
return ret;
}
return dquot_file_open(inode, filp);
}
/*
* Here we use ext4_map_blocks() to get a block mapping for a extent-based
* file rather than ext4_ext_walk_space() because we can introduce
* SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
* function. When extent status tree has been fully implemented, it will
* track all extent status for a file and we can directly use it to
* retrieve the offset for SEEK_DATA/SEEK_HOLE.
*/
/*
* When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
* lookup page cache to check whether or not there has some data between
* [startoff, endoff] because, if this range contains an unwritten extent,
* we determine this extent as a data or a hole according to whether the
* page cache has data or not.
*/
static int ext4_find_unwritten_pgoff(struct inode *inode,
int whence,
struct ext4_map_blocks *map,
loff_t *offset)
{
struct pagevec pvec;
unsigned int blkbits;
pgoff_t index;
pgoff_t end;
loff_t endoff;
loff_t startoff;
loff_t lastoff;
int found = 0;
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
pagevec_init(&pvec, 0);
do {
int i, num;
unsigned long nr_pages;
num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
(pgoff_t)num);
if (nr_pages == 0) {
if (whence == SEEK_DATA)
break;
BUG_ON(whence != SEEK_HOLE);
/*
* If this is the first time to go into the loop and
* offset is not beyond the end offset, it will be a
* hole at this offset
*/
if (lastoff == startoff || lastoff < endoff)
found = 1;
break;
}
/*
* If this is the first time to go into the loop and
* offset is smaller than the first page offset, it will be a
* hole at this offset.
*/
if (lastoff == startoff && whence == SEEK_HOLE &&
lastoff < page_offset(pvec.pages[0])) {
found = 1;
break;
}
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
struct buffer_head *bh, *head;
/*
* If the current offset is not beyond the end of given
* range, it will be a hole.
*/
if (lastoff < endoff && whence == SEEK_HOLE &&
page->index > end) {
found = 1;
*offset = lastoff;
goto out;
}
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping)) {
unlock_page(page);
continue;
}
if (!page_has_buffers(page)) {
unlock_page(page);
continue;
}
if (page_has_buffers(page)) {
lastoff = page_offset(page);
bh = head = page_buffers(page);
do {
if (buffer_uptodate(bh) ||
buffer_unwritten(bh)) {
if (whence == SEEK_DATA)
found = 1;
} else {
if (whence == SEEK_HOLE)
found = 1;
}
if (found) {
*offset = max_t(loff_t,
startoff, lastoff);
unlock_page(page);
goto out;
}
lastoff += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
lastoff = page_offset(page) + PAGE_SIZE;
unlock_page(page);
}
/*
* The no. of pages is less than our desired, that would be a
* hole in there.
*/
if (nr_pages < num && whence == SEEK_HOLE) {
found = 1;
*offset = lastoff;
break;
}
index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
out:
pagevec_release(&pvec);
return found;
}
/*
* ext4_seek_data() retrieves the offset for SEEK_DATA.
*/
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t dataoff, isize;
int blkbits;
int ret = 0;
mutex_lock(&inode->i_mutex);
isize = i_size_read(inode);
if (offset >= isize) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
blkbits = inode->i_sb->s_blocksize_bits;
start = offset >> blkbits;
last = start;
end = isize >> blkbits;
dataoff = offset;
do {
map.m_lblk = last;
map.m_len = end - last + 1;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
dataoff = (loff_t)last << blkbits;
break;
}
/*
* If there is a delay extent at this offset,
* it will be as a data.
*/
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
dataoff = (loff_t)last << blkbits;
break;
}
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
if (map.m_flags & EXT4_MAP_UNWRITTEN) {
int unwritten;
unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
&map, &dataoff);
if (unwritten)
break;
}
last++;
dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
if (dataoff > isize)
return -ENXIO;
return vfs_setpos(file, dataoff, maxsize);
}
/*
* ext4_seek_hole() retrieves the offset for SEEK_HOLE.
*/
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t holeoff, isize;
int blkbits;
int ret = 0;
mutex_lock(&inode->i_mutex);
isize = i_size_read(inode);
if (offset >= isize) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
blkbits = inode->i_sb->s_blocksize_bits;
start = offset >> blkbits;
last = start;
end = isize >> blkbits;
holeoff = offset;
do {
map.m_lblk = last;
map.m_len = end - last + 1;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
holeoff = (loff_t)last << blkbits;
continue;
}
/*
* If there is a delay extent at this offset,
* we will skip this extent.
*/
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
holeoff = (loff_t)last << blkbits;
continue;
}
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
if (map.m_flags & EXT4_MAP_UNWRITTEN) {
int unwritten;
unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
&map, &holeoff);
if (!unwritten) {
last += ret;
holeoff = (loff_t)last << blkbits;
continue;
}
}
/* find a hole */
break;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
if (holeoff > isize)
holeoff = isize;
return vfs_setpos(file, holeoff, maxsize);
}
/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
*/
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t maxbytes;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
else
maxbytes = inode->i_sb->s_maxbytes;
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
case SEEK_END:
return generic_file_llseek_size(file, offset, whence,
maxbytes, i_size_read(inode));
case SEEK_DATA:
return ext4_seek_data(file, offset, maxbytes);
case SEEK_HOLE:
return ext4_seek_hole(file, offset, maxbytes);
}
return -EINVAL;
}
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read = new_sync_read,
.write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};

151
fs/ext4/fsync.c Normal file
View file

@ -0,0 +1,151 @@
/*
* linux/fs/ext4/fsync.c
*
* Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
* from
* Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
* from
* linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4fs fsync primitive
*
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
*
* Removed unnecessary code duplication for little endian machines
* and excessive __inline__s.
* Andi Kleen, 1997
*
* Major simplications and cleanup - we only need to do the metadata, because
* we can depend on generic_block_fdatasync() to sync the data blocks.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include <trace/events/ext4.h>
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
* otherwise it will only be written by writeback, leaving a huge
* window during which a crash may lose the file. This may apply for
* the parent directory's parent as well, and so on recursively, if
* they are also freshly created.
*/
static int ext4_sync_parent(struct inode *inode)
{
struct dentry *dentry = NULL;
struct inode *next;
int ret = 0;
if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
return 0;
inode = igrab(inode);
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
dentry = d_find_any_alias(inode);
if (!dentry)
break;
next = igrab(dentry->d_parent->d_inode);
dput(dentry);
if (!next)
break;
iput(inode);
inode = next;
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
break;
ret = sync_inode_metadata(inode, 1);
if (ret)
break;
}
iput(inode);
return ret;
}
/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
* There cannot be a transaction open by this task.
* Another task could have dirtied this inode. Its data can be in any
* state in the journalling system.
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
tid_t commit_tid;
bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
if (inode->i_sb->s_flags & MS_RDONLY) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
ret = -EROFS;
goto out;
}
if (!journal) {
ret = generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
* Metadata is in the journal, we wait for proper transaction to
* commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
* ext4_force_commit will write the file data into the journal and
* will wait on that.
* filemap_fdatawait() will encounter a ton of newly-dirtied pages
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
if (ext4_should_journal_data(inode)) {
ret = ext4_force_commit(inode->i_sb);
goto out;
}
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
}
out:
trace_ext4_sync_file_exit(inode, ret);
return ret;
}

208
fs/ext4/hash.c Normal file
View file

@ -0,0 +1,208 @@
/*
* linux/fs/ext4/hash.c
*
* Copyright (C) 2002 by Theodore Ts'o
*
* This file is released under the GPL v2.
*
* This file may be redistributed under the terms of the GNU Public
* License.
*/
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/cryptohash.h>
#include "ext4.h"
#define DELTA 0x9E3779B9
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
__u32 sum = 0;
__u32 b0 = buf[0], b1 = buf[1];
__u32 a = in[0], b = in[1], c = in[2], d = in[3];
int n = 16;
do {
sum += DELTA;
b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
} while (--n);
buf[0] += b0;
buf[1] += b1;
}
/* The old legacy hash */
static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
const unsigned char *ucp = (const unsigned char *) name;
while (len--) {
hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
if (hash & 0x80000000)
hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
return hash0 << 1;
}
static __u32 dx_hack_hash_signed(const char *name, int len)
{
__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
const signed char *scp = (const signed char *) name;
while (len--) {
hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
if (hash & 0x80000000)
hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
return hash0 << 1;
}
static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
const signed char *scp = (const signed char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
val = pad;
if (len > num*4)
len = num * 4;
for (i = 0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
val = ((int) scp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
num--;
}
}
if (--num >= 0)
*buf++ = val;
while (--num >= 0)
*buf++ = pad;
}
static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
const unsigned char *ucp = (const unsigned char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
val = pad;
if (len > num*4)
len = num * 4;
for (i = 0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
num--;
}
}
if (--num >= 0)
*buf++ = val;
while (--num >= 0)
*buf++ = pad;
}
/*
* Returns the hash of a filename. If len is 0 and name is NULL, then
* this function can be used to test whether or not a hash version is
* supported.
*
* The seed is an 4 longword (32 bits) "secret" which can be used to
* uniquify a hash. If the seed is all zero's, then some default seed
* may be used.
*
* A particular hash version specifies whether or not the seed is
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
{
__u32 hash;
__u32 minor_hash = 0;
const char *p;
int i;
__u32 in[8], buf[4];
void (*str2hashbuf)(const char *, int, __u32 *, int) =
str2hashbuf_signed;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
buf[1] = 0xefcdab89;
buf[2] = 0x98badcfe;
buf[3] = 0x10325476;
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
for (i = 0; i < 4; i++) {
if (hinfo->seed[i]) {
memcpy(buf, hinfo->seed, sizeof(buf));
break;
}
}
}
switch (hinfo->hash_version) {
case DX_HASH_LEGACY_UNSIGNED:
hash = dx_hack_hash_unsigned(name, len);
break;
case DX_HASH_LEGACY:
hash = dx_hack_hash_signed(name, len);
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
(*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
}
minor_hash = buf[2];
hash = buf[1];
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
(*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
}
hash = buf[0];
minor_hash = buf[1];
break;
default:
hinfo->hash = 0;
return -1;
}
hash = hash & ~1;
if (hash == (EXT4_HTREE_EOF_32BIT << 1))
hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
}

1331
fs/ext4/ialloc.c Normal file

File diff suppressed because it is too large Load diff

1549
fs/ext4/indirect.c Normal file

File diff suppressed because it is too large Load diff

2007
fs/ext4/inline.c Normal file

File diff suppressed because it is too large Load diff

5096
fs/ext4/inode.c Normal file

File diff suppressed because it is too large Load diff

700
fs/ext4/ioctl.c Normal file
View file

@ -0,0 +1,700 @@
/*
* linux/fs/ext4/ioctl.c
*
* Copyright (C) 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*/
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/capability.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
/**
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
* @b: pointer to second memory area
* @len: number of bytes to swap
*
*/
static void memswap(void *a, void *b, size_t len)
{
unsigned char *ap, *bp;
unsigned char tmp;
ap = (unsigned char *)a;
bp = (unsigned char *)b;
while (len-- > 0) {
tmp = *ap;
*ap = *bp;
*bp = tmp;
ap++;
bp++;
}
}
/**
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
*
* Therefore you have to make sure, that calling this method twice
* will revert all changes.
*
* @inode1: pointer to first inode
* @inode2: pointer to second inode
*/
static void swap_inode_data(struct inode *inode1, struct inode *inode2)
{
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
memswap(&inode1->i_version, &inode2->i_version,
sizeof(inode1->i_version));
memswap(&inode1->i_blocks, &inode2->i_blocks,
sizeof(inode1->i_blocks));
memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
ext4_es_lru_del(inode1);
ext4_es_lru_del(inode2);
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
i_size_write(inode2, isize);
}
/**
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
*
* @sb: the super block of the filesystem
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
struct inode *inode)
{
handle_t *handle;
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
return -EINVAL;
if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
return -EPERM;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
filemap_flush(inode->i_mapping);
filemap_flush(inode_bl->i_mapping);
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
ext4_inode_block_unlocked_dio(inode_bl);
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
goto journal_err_out;
}
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
if (inode_bl->i_nlink == 0) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
i_gid_write(inode_bl, 0);
inode_bl->i_flags = 0;
ei_bl->i_flags = 0;
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_EXTENTS)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
inode_bl->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
} else {
err = ext4_mark_inode_dirty(handle, inode_bl);
if (err < 0) {
ext4_warning(inode_bl->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode_bl->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
}
}
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
journal_err_out:
ext4_inode_resume_unlocked_dio(inode);
ext4_inode_resume_unlocked_dio(inode_bl);
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
case EXT4_IOC_GETFLAGS:
ext4_get_inode_flags(ei);
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
handle_t *handle = NULL;
int err, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
unsigned int jflag;
if (!inode_owner_or_capable(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
err = mnt_want_write_file(filp);
if (err)
return err;
flags = ext4_mask_flags(inode->i_mode, flags);
err = -EPERM;
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode))
goto flags_out;
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
jflag = flags & EXT4_JOURNAL_DATA_FL;
/*
* The IMMUTABLE and APPEND_ONLY flags can only be changed by
* the relevant capability.
*
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE))
goto flags_out;
}
/*
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
err = -EOPNOTSUPP;
goto flags_out;
}
} else if (oldflags & EXT4_EOFBLOCKS_FL)
ext4_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto flags_out;
}
if (IS_SYNC(inode))
ext4_handle_sync(handle);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto flags_err;
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
if (mask & flags)
ext4_set_inode_flag(inode, i);
else
ext4_clear_inode_flag(inode, i);
}
ext4_set_inode_flags(inode);
inode->i_ctime = ext4_current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext4_journal_stop(handle);
if (err)
goto flags_out;
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
if (migrate) {
if (flags & EXT4_EXTENTS_FL)
err = ext4_ext_migrate(inode);
else
err = ext4_ind_migrate(inode);
}
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GETVERSION:
case EXT4_IOC_GETVERSION_OLD:
return put_user(inode->i_generation, (int __user *) arg);
case EXT4_IOC_SETVERSION:
case EXT4_IOC_SETVERSION_OLD: {
handle_t *handle;
struct ext4_iloc iloc;
__u32 generation;
int err;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (ext4_has_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
}
err = mnt_want_write_file(filp);
if (err)
return err;
if (get_user(generation, (int __user *) arg)) {
err = -EFAULT;
goto setversion_out;
}
mutex_lock(&inode->i_mutex);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto unlock_out;
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = ext4_current_time(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
unlock_out:
mutex_unlock(&inode->i_mutex);
setversion_out:
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GROUP_EXTEND: {
ext4_fsblk_t n_blocks_count;
int err, err2=0;
err = ext4_resize_begin(sb);
if (err)
return err;
if (get_user(n_blocks_count, (__u32 __user *)arg)) {
err = -EFAULT;
goto group_extend_out;
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
goto group_extend_out;
}
err = mnt_want_write_file(filp);
if (err)
goto group_extend_out;
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
group_extend_out:
ext4_resize_end(sb);
return err;
}
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
!(filp->f_mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(&me,
(struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
me.moved_len = 0;
donor = fdget(me.donor_fd);
if (!donor.file)
return -EBADF;
if (!(donor.file->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
}
err = mnt_want_write_file(filp);
if (err)
goto mext_out;
err = ext4_move_extents(filp, donor.file, me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
mext_out:
fdput(donor);
return err;
}
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
int err, err2=0;
err = ext4_resize_begin(sb);
if (err)
return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
sizeof(input))) {
err = -EFAULT;
goto group_add_out;
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
goto group_add_out;
}
err = mnt_want_write_file(filp);
if (err)
goto group_add_out;
err = ext4_group_add(sb, &input);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
if (!err && ext4_has_group_desc_csum(sb) &&
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, input.group);
group_add_out:
ext4_resize_end(sb);
return err;
}
case EXT4_IOC_MIGRATE:
{
int err;
if (!inode_owner_or_capable(inode))
return -EACCES;
err = mnt_want_write_file(filp);
if (err)
return err;
/*
* inode_mutex prevent write and truncate on the file.
* Read still goes through. We take i_data_sem in
* ext4_ext_swap_inode_data before we switch the
* inode format to prevent read.
*/
mutex_lock(&(inode->i_mutex));
err = ext4_ext_migrate(inode);
mutex_unlock(&(inode->i_mutex));
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_ALLOC_DA_BLKS:
{
int err;
if (!inode_owner_or_capable(inode))
return -EACCES;
err = mnt_want_write_file(filp);
if (err)
return err;
err = ext4_alloc_da_blocks(inode);
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_SWAP_BOOT:
{
int err;
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
err = mnt_want_write_file(filp);
if (err)
return err;
err = swap_inode_boot_loader(sb, inode);
mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not (yet) supported with bigalloc");
return -EOPNOTSUPP;
}
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
}
err = ext4_resize_begin(sb);
if (err)
return err;
err = mnt_want_write_file(filp);
if (err)
goto resizefs_out;
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
ext4_has_group_desc_csum(sb) &&
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, o_group);
resizefs_out:
ext4_resize_end(sb);
return err;
}
case FIDTRIM:
case FITRIM:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
int flags = cmd == FIDTRIM ? BLKDEV_DISCARD_SECURE : 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
if ((flags & BLKDEV_DISCARD_SECURE) && !blk_queue_secdiscard(q))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range, flags);
if (ret < 0)
return ret;
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
case FS_IOC_INVAL_MAPPING:
{
return invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
default:
return -ENOTTY;
}
}
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
case EXT4_IOC32_GETFLAGS:
cmd = EXT4_IOC_GETFLAGS;
break;
case EXT4_IOC32_SETFLAGS:
cmd = EXT4_IOC_SETFLAGS;
break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
break;
case EXT4_IOC32_SETVERSION:
cmd = EXT4_IOC_SETVERSION;
break;
case EXT4_IOC32_GROUP_EXTEND:
cmd = EXT4_IOC_GROUP_EXTEND;
break;
case EXT4_IOC32_GETVERSION_OLD:
cmd = EXT4_IOC_GETVERSION_OLD;
break;
case EXT4_IOC32_SETVERSION_OLD:
cmd = EXT4_IOC_SETVERSION_OLD;
break;
case EXT4_IOC32_GETRSVSZ:
cmd = EXT4_IOC_GETRSVSZ;
break;
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
case EXT4_IOC32_GROUP_ADD: {
struct compat_ext4_new_group_input __user *uinput;
struct ext4_new_group_input input;
mm_segment_t old_fs;
int err;
uinput = compat_ptr(arg);
err = get_user(input.group, &uinput->group);
err |= get_user(input.block_bitmap, &uinput->block_bitmap);
err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
err |= get_user(input.inode_table, &uinput->inode_table);
err |= get_user(input.blocks_count, &uinput->blocks_count);
err |= get_user(input.reserved_blocks,
&uinput->reserved_blocks);
if (err)
return -EFAULT;
old_fs = get_fs();
set_fs(KERNEL_DS);
err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
(unsigned long) &input);
set_fs(old_fs);
return err;
}
case EXT4_IOC_MOVE_EXT:
case FITRIM:
case EXT4_IOC_RESIZE_FS:
case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
}
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif

5266
fs/ext4/mballoc.c Normal file

File diff suppressed because it is too large Load diff

215
fs/ext4/mballoc.h Normal file
View file

@ -0,0 +1,215 @@
/*
* fs/ext4/mballoc.h
*
* Written by: Alex Tomas <alex@clusterfs.com>
*
*/
#ifndef _EXT4_MBALLOC_H
#define _EXT4_MBALLOC_H
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"
/*
* with AGGRESSIVE_CHECK allocator runs consistency checks over
* structures. these checks slow things down a lot
*/
#define AGGRESSIVE_CHECK__
/*
* with DOUBLE_CHECK defined mballoc creates persistent in-core
* bitmaps, maintains and uses them to check for double allocations
*/
#define DOUBLE_CHECK__
/*
*/
#ifdef CONFIG_EXT4_DEBUG
extern ushort ext4_mballoc_debug;
#define mb_debug(n, fmt, a...) \
do { \
if ((n) <= ext4_mballoc_debug) { \
printk(KERN_DEBUG "(%s, %d): %s: ", \
__FILE__, __LINE__, __func__); \
printk(fmt, ## a); \
} \
} while (0)
#else
#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
/*
* How long mballoc can look for a best extent (in found extents)
*/
#define MB_DEFAULT_MAX_TO_SCAN 200
/*
* How long mballoc must look for a best extent
*/
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
#define MB_DEFAULT_STATS 0
/*
* files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
* by the stream allocator, which purpose is to pack requests
* as close each to other as possible to produce smooth I/O traffic
* We use locality group prealloc space for stream request.
* We can tune the same via /proc/fs/ext4/<parition>/stream_req
*/
#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
/*
* for which requests use 2^N search using buddies
*/
#define MB_DEFAULT_ORDER2_REQS 2
/*
* default group prealloc size 512 blocks
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
struct ext4_free_data {
/* MUST be the first member */
struct ext4_journal_cb_entry efd_jce;
/* ext4_free_data private data starts from here */
/* this links the free block information from group_info */
struct rb_node efd_node;
/* group which free block extent belongs */
ext4_group_t efd_group;
/* free block extent */
ext4_grpblk_t efd_start_cluster;
ext4_grpblk_t efd_count;
/* transaction which freed this extent */
tid_t efd_tid;
};
struct ext4_prealloc_space {
struct list_head pa_inode_list;
struct list_head pa_group_list;
union {
struct list_head pa_tmp_list;
struct rcu_head pa_rcu;
} u;
spinlock_t pa_lock;
atomic_t pa_count;
unsigned pa_deleted;
ext4_fsblk_t pa_pstart; /* phys. block */
ext4_lblk_t pa_lstart; /* log. block */
ext4_grpblk_t pa_len; /* len of preallocated chunk */
ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
};
enum {
MB_INODE_PA = 0,
MB_GROUP_PA = 1
};
struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start; /* In cluster units */
ext4_group_t fe_group;
ext4_grpblk_t fe_len; /* In cluster units */
};
/*
* Locality group:
* we try to group all related changes together
* so that writeback can flush/allocate them together as well
* Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
* (512). We store prealloc space into the hash based on the pa_free blocks
* order value.ie, fls(pa_free)-1;
*/
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
/* for allocator */
/* to serialize allocates */
struct mutex lg_mutex;
/* list of preallocations */
struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock;
};
struct ext4_allocation_context {
struct inode *ac_inode;
struct super_block *ac_sb;
/* original request */
struct ext4_free_extent ac_o_ex;
/* goal request (normalized ac_o_ex) */
struct ext4_free_extent ac_g_ex;
/* the best found extent */
struct ext4_free_extent ac_b_ex;
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_tail;
__u16 ac_buddy;
__u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
#define AC_STATUS_CONTINUE 1
#define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3
struct ext4_buddy {
struct page *bd_buddy_page;
void *bd_buddy;
struct page *bd_bitmap_page;
void *bd_bitmap;
struct ext4_group_info *bd_info;
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
};
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
return ext4_group_first_block_no(sb, fex->fe_group) +
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
#endif

672
fs/ext4/migrate.c Normal file
View file

@ -0,0 +1,672 @@
/*
* Copyright IBM Corporation, 2007
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
/*
* The contiguous blocks details which can be
* represented by a single extent
*/
struct migrate_struct {
ext4_lblk_t first_block, last_block, curr_block;
ext4_fsblk_t first_pblock, last_pblock;
};
static int finish_range(handle_t *handle, struct inode *inode,
struct migrate_struct *lb)
{
int retval = 0, needed;
struct ext4_extent newext;
struct ext4_ext_path *path;
if (lb->first_pblock == 0)
return 0;
/* Add the extent to temp inode*/
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
/* Locking only for convinience since we are operating on temp inode */
down_write(&EXT4_I(inode)->i_data_sem);
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
path = NULL;
goto err_out;
}
/*
* Calculate the credit needed to inserting this extent
* Since we are doing this in loop we may accumalate extra
* credit. But below we try to not accumalate too much
* of them by restarting the journal.
*/
needed = ext4_ext_calc_credits_for_single_extent(inode,
lb->last_block - lb->first_block + 1, path);
/*
* Make sure the credit we accumalated is not really high
*/
if (needed && ext4_handle_has_enough_credits(handle,
EXT4_RESERVE_TRANS_BLOCKS)) {
up_write((&EXT4_I(inode)->i_data_sem));
retval = ext4_journal_restart(handle, needed);
down_write((&EXT4_I(inode)->i_data_sem));
if (retval)
goto err_out;
} else if (needed) {
retval = ext4_journal_extend(handle, needed);
if (retval) {
/*
* IF not able to extend the journal restart the journal
*/
up_write((&EXT4_I(inode)->i_data_sem));
retval = ext4_journal_restart(handle, needed);
down_write((&EXT4_I(inode)->i_data_sem));
if (retval)
goto err_out;
}
}
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
ext4_ext_drop_refs(path);
kfree(path);
lb->first_pblock = 0;
return retval;
}
static int update_extent_range(handle_t *handle, struct inode *inode,
ext4_fsblk_t pblock, struct migrate_struct *lb)
{
int retval;
/*
* See if we can add on to the existing range (if it exists)
*/
if (lb->first_pblock &&
(lb->last_pblock+1 == pblock) &&
(lb->last_block+1 == lb->curr_block)) {
lb->last_pblock = pblock;
lb->last_block = lb->curr_block;
lb->curr_block++;
return 0;
}
/*
* Start a new range.
*/
retval = finish_range(handle, inode, lb);
lb->first_pblock = lb->last_pblock = pblock;
lb->first_block = lb->last_block = lb->curr_block;
lb->curr_block++;
return retval;
}
static int update_ind_extent_range(handle_t *handle, struct inode *inode,
ext4_fsblk_t pblock,
struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, inode,
le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
lb->curr_block++;
}
}
put_bh(bh);
return retval;
}
static int update_dind_extent_range(handle_t *handle, struct inode *inode,
ext4_fsblk_t pblock,
struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_ind_extent_range(handle, inode,
le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
/* Only update the file block number */
lb->curr_block += max_entries;
}
}
put_bh(bh);
return retval;
}
static int update_tind_extent_range(handle_t *handle, struct inode *inode,
ext4_fsblk_t pblock,
struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_dind_extent_range(handle, inode,
le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
/* Only update the file block number */
lb->curr_block += max_entries * max_entries;
}
}
put_bh(bh);
return retval;
}
static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
{
int retval = 0, needed;
if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
return 0;
/*
* We are freeing a blocks. During this we touch
* superblock, group descriptor and block bitmap.
* So allocate a credit of 3. We may update
* quota (user and group).
*/
needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
return retval;
}
static int free_dind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i;
__le32 *tmp_idata;
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
if (!bh)
return -EIO;
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
extend_credit_for_blkdel(handle, inode);
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
return 0;
}
static int free_tind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i, retval = 0;
__le32 *tmp_idata;
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
if (!bh)
return -EIO;
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
retval = free_dind_blocks(handle,
inode, tmp_idata[i]);
if (retval) {
put_bh(bh);
return retval;
}
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
return 0;
}
static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
{
int retval;
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
extend_credit_for_blkdel(handle, inode);
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
}
/* ei->i_data[EXT4_DIND_BLOCK] */
if (i_data[1]) {
retval = free_dind_blocks(handle, inode, i_data[1]);
if (retval)
return retval;
}
/* ei->i_data[EXT4_TIND_BLOCK] */
if (i_data[2]) {
retval = free_tind_blocks(handle, inode, i_data[2]);
if (retval)
return retval;
}
return 0;
}
static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
struct inode *tmp_inode)
{
int retval;
__le32 i_data[3];
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
/*
* One credit accounted for writing the
* i_data field of the original inode
*/
retval = ext4_journal_extend(handle, 1);
if (retval) {
retval = ext4_journal_restart(handle, 1);
if (retval)
goto err_out;
}
i_data[0] = ei->i_data[EXT4_IND_BLOCK];
i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
down_write(&EXT4_I(inode)->i_data_sem);
/*
* if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
* Update i_blocks with the new blocks that got
* allocated while adding extents for extent index
* blocks.
*
* While converting to extents we need not
* update the orignal inode i_blocks for extent blocks
* via quota APIs. The quota update happened via tmp_inode already.
*/
spin_lock(&inode->i_lock);
inode->i_blocks += tmp_inode->i_blocks;
spin_unlock(&inode->i_lock);
up_write(&EXT4_I(inode)->i_data_sem);
/*
* We mark the inode dirty after, because we decrement the
* i_blocks when freeing the indirect meta-data blocks
*/
retval = free_ind_block(handle, inode, i_data);
ext4_mark_inode_dirty(handle, inode);
err_out:
return retval;
}
static int free_ext_idx(handle_t *handle, struct inode *inode,
struct ext4_extent_idx *ix)
{
int i, retval = 0;
ext4_fsblk_t block;
struct buffer_head *bh;
struct ext4_extent_header *eh;
block = ext4_idx_pblock(ix);
bh = sb_bread(inode->i_sb, block);
if (!bh)
return -EIO;
eh = (struct ext4_extent_header *)bh->b_data;
if (eh->eh_depth != 0) {
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
if (retval)
break;
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return retval;
}
/*
* Free the extent meta data blocks only
*/
static int free_ext_block(handle_t *handle, struct inode *inode)
{
int i, retval = 0;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
struct ext4_extent_idx *ix;
if (eh->eh_depth == 0)
/*
* No extra blocks allocated for extent meta data
*/
return 0;
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
if (retval)
return retval;
}
return retval;
}
int ext4_ext_migrate(struct inode *inode)
{
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
struct ext4_inode_info *ei;
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
__u32 goal;
uid_t owner[2];
/*
* If the filesystem does not support extents, or the inode
* already is extent-based, error out.
*/
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
/*
* don't migrate fast symlink
*/
return retval;
/*
* Worst case we can touch the allocation bitmaps, a bgd
* block, and a block to link in the orphan list. We do need
* need to worry about credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
return retval;
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
owner[0] = i_uid_read(inode);
owner[1] = i_gid_read(inode);
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
return retval;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
* when we drop inode reference.
*/
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
* start with one credit accounted for
* superblock modification.
*
* For the tmp_inode we already have committed the
* transaction that created the inode. Later as and
* when we add extents we extent the journal
*/
/*
* Even though we take i_mutex we can still cause block
* allocation via mmap write to holes. If we have allocated
* new blocks we fail migrate. New block allocation will
* clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
* with i_data_sem held to prevent racing with block
* allocation.
*/
down_read(&EXT4_I(inode)->i_data_sem);
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
/*
* It is impossible to update on-disk structures without
* a handle, so just rollback in-core changes and live other
* work to orphan_list_cleanup()
*/
ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out;
}
ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
/* 32 bit block address 4 bytes */
max_entries = inode->i_sb->s_blocksize >> 2;
for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[i]), &lb);
if (retval)
goto err_out;
} else
lb.curr_block++;
}
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
if (retval)
goto err_out;
}
/*
* Build the last extent
*/
retval = finish_range(handle, tmp_inode, &lb);
err_out:
if (retval)
/*
* Failure case delete the extent information with the
* tmp_inode
*/
free_ext_block(handle, tmp_inode);
else {
retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
if (retval)
/*
* if we fail to swap inode data free the extent
* details of the tmp inode
*/
free_ext_block(handle, tmp_inode);
}
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
if (ext4_journal_extend(handle, 1) != 0)
ext4_journal_restart(handle, 1);
/*
* Mark the tmp_inode as of size zero
*/
i_size_write(tmp_inode, 0);
/*
* set the i_blocks count to zero
* so that the ext4_delete_inode does the
* right job
*
* We don't need to take the i_lock because
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
ext4_journal_stop(handle);
out:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
}
/*
* Migrate a simple extent-based inode to use the i_blocks[] array
*/
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
ext4_fsblk_t blk;
handle_t *handle;
int ret;
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC))
return -EOPNOTSUPP;
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
if (ret)
goto errout;
eh = ext_inode_hdr(inode);
ex = EXT_FIRST_EXTENT(eh);
if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
ret = -EOPNOTSUPP;
goto errout;
}
if (eh->eh_entries == 0)
blk = len = 0;
else {
len = le16_to_cpu(ex->ee_len);
blk = ext4_ext_pblock(ex);
if (len > EXT4_NDIR_BLOCKS) {
ret = -EOPNOTSUPP;
goto errout;
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
memset(ei->i_data, 0, sizeof(ei->i_data));
for (i=0; i < len; i++)
ei->i_data[i] = cpu_to_le32(blk++);
ext4_mark_inode_dirty(handle, inode);
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
return ret;
}

393
fs/ext4/mmp.c Normal file
View file

@ -0,0 +1,393 @@
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
#include <linux/utsname.h>
#include <linux/kthread.h>
#include "ext4.h"
/* Checksumming functions */
static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
if (!ext4_has_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
}
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
if (!ext4_has_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
}
/*
* Write the MMP block using WRITE_SYNC to try to get the block on-disk
* faster.
*/
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
/*
* We protect against freezing so that we don't create dirty buffers
* on frozen filesystem.
*/
sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
mark_buffer_dirty(bh);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
return 1;
return 0;
}
/*
* Read the MMP block. It _must_ be read from disk and hence we clear the
* uptodate flag on the buffer.
*/
static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
ext4_fsblk_t mmp_block)
{
struct mmp_struct *mmp;
if (*bh)
clear_buffer_uptodate(*bh);
/* This would be sb_bread(sb, mmp_block), except we need to be sure
* that the MD RAID device cache has been bypassed, and that the read
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
if (!*bh)
return -ENOMEM;
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
*bh = NULL;
}
}
if (unlikely(!*bh)) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
}
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
!ext4_mmp_csum_verify(sb, mmp))
return -EINVAL;
return 0;
}
/*
* Dump as much information as possible to help the admin.
*/
void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
const char *function, unsigned int line, const char *msg)
{
__ext4_warning(sb, function, line, msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
"node: %s, last update device: %s\n",
(long long unsigned int) le64_to_cpu(mmp->mmp_time),
mmp->mmp_nodename, mmp->mmp_bdevname);
}
/*
* kmmpd will update the MMP sequence every s_mmp_update_interval seconds
*/
static int kmmpd(void *data)
{
struct super_block *sb = ((struct mmpd_data *) data)->sb;
struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
unsigned long failed_writes = 0;
int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
int retval;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
mmp->mmp_time = cpu_to_le64(get_seconds());
/*
* Start with the higher mmp_check_interval and reduce it if
* the MMP block is being updated on time.
*/
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
while (!kthread_should_stop()) {
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
mmp->mmp_seq = cpu_to_le32(seq);
mmp->mmp_time = cpu_to_le64(get_seconds());
last_update_time = jiffies;
retval = write_mmp_block(sb, bh);
/*
* Don't spew too many error messages. Print one every
* (s_mmp_update_interval * 60) seconds.
*/
if (retval) {
if ((failed_writes % 60) == 0)
ext4_error(sb, "Error writing to MMP block");
failed_writes++;
}
if (!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
HZ - diff);
/*
* We need to make sure that more than mmp_check_interval
* seconds have not passed since writing. If that has happened
* we need to check if the MMP block is as we left it.
*/
diff = jiffies - last_update_time;
if (diff > mmp_check_interval * HZ) {
struct buffer_head *bh_check = NULL;
struct mmp_struct *mmp_check;
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
if (mmp->mmp_seq != mmp_check->mmp_seq ||
memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
sizeof(mmp->mmp_nodename))) {
dump_mmp_msg(sb, mmp_check,
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
goto failed;
}
put_bh(bh_check);
}
/*
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
EXT4_MMP_MAX_CHECK_INTERVAL),
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
/*
* Unmount seems to be clean.
*/
mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
mmp->mmp_time = cpu_to_le64(get_seconds());
retval = write_mmp_block(sb, bh);
failed:
kfree(data);
brelse(bh);
return retval;
}
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
*/
static unsigned int mmp_new_seq(void)
{
u32 new_seq;
do {
new_seq = prandom_u32();
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
}
/*
* Protect the filesystem from being mounted more than once.
*/
int ext4_multi_mount_protect(struct super_block *sb,
ext4_fsblk_t mmp_block)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
int retval;
if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
mmp_block >= ext4_blocks_count(es)) {
ext4_warning(sb, "Invalid MMP block in superblock");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
/*
* If check_interval in MMP block is larger, use that instead of
* update_interval from the superblock.
*/
if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
seq = le32_to_cpu(mmp->mmp_seq);
if (seq == EXT4_MMP_SEQ_CLEAN)
goto skip;
if (seq == EXT4_MMP_SEQ_FSCK) {
dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
goto failed;
}
wait_time = min(mmp_check_interval * 2 + 1,
mmp_check_interval + 60);
/* Print MMP interval if more than 20 secs. */
if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
ext4_warning(sb, "MMP interval %u higher than expected, please"
" wait.\n", wait_time * 2);
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
skip:
/*
* write a new random sequence number.
*/
seq = mmp_new_seq();
mmp->mmp_seq = cpu_to_le32(seq);
retval = write_mmp_block(sb, bh);
if (retval)
goto failed;
/*
* wait for MMP interval and check mmp_seq.
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
if (!mmpd_data) {
ext4_warning(sb, "not enough memory for mmpd_data");
goto failed;
}
mmpd_data->sb = sb;
mmpd_data->bh = bh;
/*
* Start a kernel thread to update the MMP block periodically.
*/
EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
}
return 0;
failed:
brelse(bh);
return 1;
}

684
fs/ext4/move_extent.c Normal file
View file

@ -0,0 +1,684 @@
/*
* Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
* Written by Takashi Sato <t-sato@yk.jp.nec.com>
* Akira Fujita <a-fujita@rs.jp.nec.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
/**
* get_ext_path - Find an extent path for designated logical block number.
*
* @inode: an inode which is searched
* @lblock: logical block number to find an extent path
* @path: pointer to an extent path pointer (for output)
*
* ext4_find_extent wrapper. Return 0 on success, or a negative error value
* on failure.
*/
static inline int
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
struct ext4_ext_path **ppath)
{
struct ext4_ext_path *path;
path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
if (path[ext_depth(inode)].p_ext == NULL) {
ext4_ext_drop_refs(path);
kfree(path);
*ppath = NULL;
return -ENODATA;
}
*ppath = path;
return 0;
}
/**
* ext4_double_down_write_data_sem - Acquire two inodes' write lock
* of i_data_sem
*
* Acquire write lock of i_data_sem of the two inodes
*/
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
} else {
down_write(&EXT4_I(second)->i_data_sem);
down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
}
}
/**
* ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
/**
* mext_check_coverage - Check that all extents in range has the same type
*
* @inode: inode in question
* @from: block offset of inode
* @count: block count to be checked
* @unwritten: extents expected to be unwritten
* @err: pointer to save error value
*
* Return 1 if all extents in range has expected type, and zero otherwise.
*/
static int
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
int unwritten, int *err)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent *ext;
int ret = 0;
ext4_lblk_t last = from + count;
while (from < last) {
*err = get_ext_path(inode, from, &path);
if (*err)
goto out;
ext = path[ext_depth(inode)].p_ext;
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
ext4_ext_drop_refs(path);
}
ret = 1;
out:
ext4_ext_drop_refs(path);
kfree(path);
return ret;
}
/**
* mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
* @index1: page index
* @index2: page index
* @page: result page vector
*
* Grab two locked pages for inode's by inode order
*/
static int
mext_page_double_lock(struct inode *inode1, struct inode *inode2,
pgoff_t index1, pgoff_t index2, struct page *page[2])
{
struct address_space *mapping[2];
unsigned fl = AOP_FLAG_NOFS;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
mapping[0] = inode1->i_mapping;
mapping[1] = inode2->i_mapping;
} else {
pgoff_t tmp = index1;
index1 = index2;
index2 = tmp;
mapping[0] = inode2->i_mapping;
mapping[1] = inode1->i_mapping;
}
page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
if (!page[0])
return -ENOMEM;
page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
if (!page[1]) {
unlock_page(page[0]);
page_cache_release(page[0]);
return -ENOMEM;
}
/*
* grab_cache_page_write_begin() may not wait on page's writeback if
* BDI not demand that. But it is reasonable to be very conservative
* here and explicitly wait on page's writeback
*/
wait_on_page_writeback(page[0]);
wait_on_page_writeback(page[1]);
if (inode1 > inode2) {
struct page *tmp;
tmp = page[0];
page[0] = page[1];
page[1] = tmp;
}
return 0;
}
/* Force page buffers uptodate w/o dropping page's lock */
static int
mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
sector_t block;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, block_start, block_end;
int i, err, nr = 0, partial = 0;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (PageUptodate(page))
return 0;
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
partial = 1;
continue;
}
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
err = ext4_get_block(inode, block, bh, 0);
if (err) {
SetPageError(page);
return err;
}
if (!buffer_mapped(bh)) {
zero_user(page, block_start, blocksize);
set_buffer_uptodate(bh);
continue;
}
}
BUG_ON(nr >= MAX_BUF_PER_PAGE);
arr[nr++] = bh;
}
/* No io required */
if (!nr)
goto out;
for (i = 0; i < nr; i++) {
bh = arr[i];
if (!bh_uptodate_or_lock(bh)) {
err = bh_submit_read(bh);
if (err)
return err;
}
}
out:
if (!partial)
SetPageUptodate(page);
return 0;
}
/**
* move_extent_per_page - Move extent data per page
*
* @o_filp: file structure of original file
* @donor_inode: donor inode
* @orig_page_offset: page index on original file
* @donor_page_offset: page index on donor file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @unwritten: orig extent is unwritten or not
* @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
* with donor inode extents by calling ext4_swap_extents().
* Finally, write out the saved data in new original inode blocks. Return
* replaced block count.
*/
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, pgoff_t donor_page_offset,
int data_offset_in_page,
int block_len_in_page, int unwritten, int *err)
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
int err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
* It needs twice the amount of ordinary journal buffers because
* inode and donor_inode may change each different metadata blocks.
*/
again:
*err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
return 0;
}
if (segment_eq(get_fs(), KERNEL_DS))
w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
donor_blk_offset = donor_page_offset * blocks_per_page +
data_offset_in_page;
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
/* Replace the last block */
tmp_data_size = orig_inode->i_size & (blocksize - 1);
/*
* If data_size equal zero, it shows data_size is multiples of
* blocksize. So we set appropriate value.
*/
if (tmp_data_size == 0)
tmp_data_size = blocksize;
data_size = tmp_data_size +
((block_len_in_page - 1) << orig_inode->i_blkbits);
} else
data_size = block_len_in_page << orig_inode->i_blkbits;
replaced_size = data_size;
*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
donor_page_offset, pagep);
if (unlikely(*err < 0))
goto stop_journal;
/*
* If orig extent was unwritten it can become initialized
* at any time after i_data_sem was dropped, in order to
* serialize with delalloc we have recheck extent while we
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
if (!unwritten) {
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
if ((page_has_private(pagep[0]) &&
!try_to_release_page(pagep[0], 0)) ||
(page_has_private(pagep[1]) &&
!try_to_release_page(pagep[1], 0))) {
*err = -EBUSY;
goto drop_data_sem;
}
replaced_count = ext4_swap_extents(handle, orig_inode,
donor_inode, orig_blk_offset,
donor_blk_offset,
block_len_in_page, 1, err);
drop_data_sem:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
}
data_copy:
*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
if (*err)
goto unlock_pages;
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
(page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
*err = -EBUSY;
goto unlock_pages;
}
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
orig_blk_offset, donor_blk_offset,
block_len_in_page, 1, err);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (*err) {
if (replaced_count) {
block_len_in_page = replaced_count;
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
goto unlock_pages;
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
*err = __block_write_begin(pagep[0], from, replaced_size,
ext4_get_block);
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
if (unlikely(*err < 0))
goto repair_branches;
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
*err = ext4_jbd2_file_inode(handle, orig_inode);
unlock_pages:
unlock_page(pagep[0]);
page_cache_release(pagep[0]);
unlock_page(pagep[1]);
page_cache_release(pagep[1]);
stop_journal:
ext4_journal_stop(handle);
/* Buffer was busy because probably is pinned to journal transaction,
* force transaction commit may help to free it. */
if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
&retries))
goto again;
return replaced_count;
repair_branches:
/*
* This should never ever happen!
* Extents are swapped already, but we are not able to copy data.
* Try to swap extents to it's original places
*/
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
orig_blk_offset, donor_blk_offset,
block_len_in_page, 0, &err2);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
"Unable to copy data block,"
" data will be lost.");
*err = -EIO;
}
replaced_count = 0;
goto unlock_pages;
}
/**
* mext_check_arguments - Check whether move extent can be done
*
* @orig_inode: original inode
* @donor_inode: donor inode
* @orig_start: logical start offset in block for orig
* @donor_start: logical start offset in block for donor
* @len: the number of blocks to be moved
*
* Check the arguments of ext4_move_extents() whether the files can be
* exchanged with each other.
* Return 0 on success, or a negative error value on failure.
*/
static int
mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
__u64 donor_start, __u64 *len)
{
__u64 orig_eof, donor_eof;
unsigned int blkbits = orig_inode->i_blkbits;
unsigned int blocksize = 1 << blkbits;
orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
ext4_debug("ext4 move extent: suid or sgid is set"
" to donor file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
return -EPERM;
/* Ext4 move extent does not support swapfile */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
"not be swapfile [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EBUSY;
}
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
"based file [ino:orig %lu]\n", orig_inode->i_ino);
return -EOPNOTSUPP;
} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: donor file is not extents "
"based file [ino:donor %lu]\n", donor_inode->i_ino);
return -EOPNOTSUPP;
}
if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
ext4_debug("ext4 move extent: File size is 0 byte\n");
return -EINVAL;
}
/* Start offset should be same */
if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
(donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
ext4_debug("ext4 move extent: orig and donor's start "
"offset are not alligned [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if ((orig_start >= EXT_MAX_BLOCKS) ||
(donor_start >= EXT_MAX_BLOCKS) ||
(*len > EXT_MAX_BLOCKS) ||
(donor_start + *len >= EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_eof < orig_start + *len - 1)
*len = orig_eof - orig_start;
if (donor_eof < donor_start + *len - 1)
*len = donor_eof - donor_start;
if (!*len) {
ext4_debug("ext4 move extent: len should not be 0 "
"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
donor_inode->i_ino);
return -EINVAL;
}
return 0;
}
/**
* ext4_move_extents - Exchange the specified range of a file
*
* @o_filp: file structure of the original file
* @d_filp: file structure of the donor file
* @orig_blk: start offset in block for orig
* @donor_blk: start offset in block for donor
* @len: the number of blocks to be moved
* @moved_len: moved block length
*
* This function returns 0 and moved block length is set in moved_len
* if succeed, otherwise returns error value.
*
*/
int
ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
__u64 donor_blk, __u64 len, __u64 *moved_len)
{
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
struct ext4_ext_path *path = NULL;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
ext4_lblk_t o_end, o_start = orig_blk;
ext4_lblk_t d_start = donor_blk;
int ret;
if (orig_inode->i_sb != donor_inode->i_sb) {
ext4_debug("ext4 move extent: The argument files "
"should be in same FS [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
/* orig and donor should be different inodes */
if (orig_inode == donor_inode) {
ext4_debug("ext4 move extent: The argument files should not "
"be same inode [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
"regular file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
/* TODO: This is non obvious task to swap blocks for inodes with full
jornaling enabled */
if (ext4_should_journal_data(orig_inode) ||
ext4_should_journal_data(donor_inode)) {
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
ext4_inode_block_unlocked_dio(donor_inode);
inode_dio_wait(orig_inode);
inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
donor_blk, &len);
if (ret)
goto out;
o_end = o_start + len;
while (o_start < o_end) {
struct ext4_extent *ex;
ext4_lblk_t cur_blk, next_blk;
pgoff_t orig_page_index, donor_page_index;
int offset_in_page;
int unwritten, cur_len;
ret = get_ext_path(orig_inode, o_start, &path);
if (ret)
goto out;
ex = path[path->p_depth].p_ext;
next_blk = ext4_ext_next_allocated_block(path);
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
if (next_blk == EXT_MAX_BLOCKS) {
o_start = o_end;
ret = -ENODATA;
goto out;
}
d_start += next_blk - o_start;
o_start = next_blk;
continue;
/* Check hole after the start pos */
} else if (cur_blk > o_start) {
/* Skip hole */
d_start += cur_blk - o_start;
o_start = cur_blk;
/* Extent inside requested range ?*/
if (cur_blk >= o_end)
goto out;
} else { /* in_range(o_start, o_blk, o_len) */
cur_len += cur_blk - o_start;
}
unwritten = ext4_ext_is_unwritten(ex);
if (o_end - o_start < cur_len)
cur_len = o_end - o_start;
orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
orig_inode->i_blkbits);
donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
if (cur_len > blocks_per_page- offset_in_page)
cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
* a. transaction deadlock among ext4_journal_start,
* ->write_begin via pagefault, and jbd2_journal_commit
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
move_extent_per_page(o_filp, donor_inode,
orig_page_index, donor_page_index,
offset_in_page, cur_len,
unwritten, &ret);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
o_start += cur_len;
d_start += cur_len;
}
*moved_len = o_start - orig_blk;
if (*moved_len > len)
*moved_len = len;
out:
if (*moved_len) {
ext4_discard_preallocations(orig_inode);
ext4_discard_preallocations(donor_inode);
}
ext4_ext_drop_refs(path);
kfree(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
return ret;
}

3612
fs/ext4/namei.c Normal file

File diff suppressed because it is too large Load diff

497
fs/ext4/page-io.c Normal file
View file

@ -0,0 +1,497 @@
/*
* linux/fs/ext4/page-io.c
*
* This contains the new page_io functions for ext4
*
* Written by Theodore Ts'o, 2010.
*/
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
static struct kmem_cache *io_end_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
}
/*
* Print an buffer I/O error compatible with the fs/buffer.c. This
* provides compatibility with dmesg scrapers that look for a specific
* buffer I/O error message. We really need a unified error reporting
* structure to userspace ala Digital Unix's uerf system, but it's
* probably not going to happen in my lifetime, due to LKML politics...
*/
static void buffer_io_error(struct buffer_head *bh)
{
char b[BDEVNAME_SIZE];
printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
bdevname(bh->b_bdev, b),
(unsigned long long)bh->b_blocknr);
}
static void ext4_finish_bio(struct bio *bio)
{
int i;
int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec;
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
unsigned bio_end = bio_start + bvec->bv_len;
unsigned under_io = 0;
unsigned long flags;
if (!page)
continue;
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
}
bh = head = page_buffers(page);
/*
* We check all buffers in the page under BH_Uptodate_Lock
* to avoid races with other end io clearing async_write flags
*/
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
if (buffer_async_write(bh))
under_io++;
continue;
}
clear_buffer_async_write(bh);
if (error)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io)
end_page_writeback(page);
}
}
static void ext4_release_io_end(ext4_io_end_t *io_end)
{
struct bio *bio, *next_bio;
BUG_ON(!list_empty(&io_end->list));
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
WARN_ON(io_end->handle);
if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
wake_up_all(ext4_ioend_wq(io_end->inode));
for (bio = io_end->bio; bio; bio = next_bio) {
next_bio = bio->bi_private;
ext4_finish_bio(bio);
bio_put(bio);
}
kmem_cache_free(io_end_cachep, io_end);
}
static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
}
/*
* Check a range of space and convert unwritten extents to written. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
io->handle = NULL; /* Following call will use up the handle */
ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
ext4_clear_io_unwritten_flag(io);
ext4_release_io_end(io);
return ret;
}
static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
list_for_each_entry(io, head, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
io1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
#endif
}
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
struct workqueue_struct *wq;
unsigned long flags;
/* Only reserved conversions from writeback should enter here */
WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
WARN_ON(!io_end->handle && sbi->s_journal);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
queue_work(wq, &ei->i_rsv_conversion_work);
list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
ext4_io_end_t *io;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, ret = 0;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode, head);
list_replace_init(head, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
io = list_entry(unwritten.next, ext4_io_end_t, list);
BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
list_del_init(&io->list);
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
}
return ret;
}
/*
* work on completed IO, to convert unwritten extents to extents
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
i_rsv_conversion_work);
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
if (io) {
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
}
return io;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
ext4_release_io_end(io_end);
return;
}
ext4_add_complete_io(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_extents(io_end->handle,
io_end->inode, io_end->offset,
io_end->size);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
ext4_release_io_end(io_end);
}
return err;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
atomic_inc(&io_end->count);
return io_end;
}
/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
BUG_ON(!io_end);
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
if (error) {
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping, error);
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each
* other.
*/
bio->bi_private = xchg(&io_end->bio, bio);
ext4_put_io_end_defer(io_end);
} else {
/*
* Drop io_end reference early. Inode can get freed once
* we finish the bio.
*/
ext4_put_io_end_defer(io_end);
ext4_finish_bio(bio);
bio_put(bio);
}
}
void ext4_io_submit(struct ext4_io_submit *io)
{
struct bio *bio = io->io_bio;
if (bio) {
bio_get(io->io_bio);
submit_bio(io->io_op, io->io_bio);
BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
bio_put(io->io_bio);
}
io->io_bio = NULL;
}
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_bio = NULL;
io->io_end = NULL;
}
static int io_submit_init_bio(struct ext4_io_submit *io,
struct buffer_head *bh)
{
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
if (!bio)
return -ENOMEM;
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
struct buffer_head *bh)
{
int ret;
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
io->io_next_block++;
return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
struct writeback_control *wbc,
bool keep_towrite)
{
struct inode *inode = page->mapping->host;
unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
blocksize = 1 << inode->i_blkbits;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (keep_towrite)
set_page_writeback_keepwrite(page);
else
set_page_writeback(page);
ClearPageError(page);
/*
* Comments copied from block_write_full_page:
*
* The page straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
* end_page_writeback() cannot be called from ext4_bio_end_io() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
bh = head = page_buffers(page);
do {
block_start = bh_offset(bh);
if (block_start >= len) {
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
}
if (!buffer_dirty(bh) || buffer_delay(bh) ||
!buffer_mapped(bh) || buffer_unwritten(bh)) {
/* A hole? We can safely clear the dirty bit */
if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
if (io->io_bio)
ext4_io_submit(io);
continue;
}
if (buffer_new(bh)) {
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
} while ((bh = bh->b_this_page) != head);
/* Now submit buffers to write */
bh = head = page_buffers(page);
do {
if (!buffer_async_write(bh))
continue;
ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
* we can do but mark the page as dirty, and
* better luck next time.
*/
redirty_page_for_writepage(wbc, page);
break;
}
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
/* Error stopped previous loop? Clean up buffers... */
if (ret) {
do {
clear_buffer_async_write(bh);
bh = bh->b_this_page;
} while (bh != head);
}
unlock_page(page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
end_page_writeback(page);
return ret;
}

2021
fs/ext4/resize.c Normal file

File diff suppressed because it is too large Load diff

5791
fs/ext4/super.c Normal file

File diff suppressed because it is too large Load diff

52
fs/ext4/symlink.c Normal file
View file

@ -0,0 +1,52 @@
/*
* linux/fs/ext4/symlink.c
*
* Only fast symlinks left here - the rest is done by generic code. AV, 1999
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/symlink.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4 symlink handling code
*/
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/namei.h>
#include "ext4.h"
#include "xattr.h"
static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
nd_set_link(nd, (char *) ei->i_data);
return NULL;
}
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
};

43
fs/ext4/truncate.h Normal file
View file

@ -0,0 +1,43 @@
/*
* linux/fs/ext4/truncate.h
*
* Common inline functions needed for truncate support
*/
/*
* Truncate blocks that were not used by write. We have to truncate the
* pagecache as well so that corresponding buffers get properly unmapped.
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
}
/*
* Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
*/
static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
{
ext4_lblk_t needed;
needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
/* Give ourselves just enough room to cope with inodes in which
* i_blocks is corrupt: we've seen disk corruptions in the past
* which resulted in random data in an inode which looked enough
* like a regular file for ext4 to try to delete it. Things
* will go a bit crazy if that happens, but at least we should
* try not to panic the whole kernel. */
if (needed < 2)
needed = 2;
/* But we need to bound the transaction so we don't overflow the
* journal. */
if (needed > EXT4_MAX_TRANS_DATA)
needed = EXT4_MAX_TRANS_DATA;
return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}

1729
fs/ext4/xattr.c Normal file

File diff suppressed because it is too large Load diff

136
fs/ext4/xattr.h Normal file
View file

@ -0,0 +1,136 @@
/*
File: fs/ext4/xattr.h
On-disk format of extended attributes for the ext4 filesystem.
(C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
#include <linux/xattr.h>
/* Magic value in attribute blocks */
#define EXT4_XATTR_MAGIC 0xEA020000
/* Maximum number of references to one attribute block */
#define EXT4_XATTR_REFCOUNT_MAX 1024
/* Name indexes */
#define EXT4_XATTR_INDEX_USER 1
#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
#define EXT4_XATTR_INDEX_TRUSTED 4
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
__le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
/* id = inum if refcount=1, blknum otherwise */
__u32 h_reserved[3]; /* zero right now */
};
struct ext4_xattr_ibody_header {
__le32 h_magic; /* magic number for identification */
};
struct ext4_xattr_entry {
__u8 e_name_len; /* length of name */
__u8 e_name_index; /* attribute name index */
__le16 e_value_offs; /* offset in disk block of value */
__le32 e_value_block; /* disk block attribute is stored on (n/i) */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
char e_name[0]; /* attribute name */
};
#define EXT4_XATTR_PAD_BITS 2
#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
#define EXT4_XATTR_LEN(name_len) \
(((name_len) + EXT4_XATTR_ROUND + \
sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
#define EXT4_XATTR_NEXT(entry) \
((struct ext4_xattr_entry *)( \
(char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
#define EXT4_XATTR_SIZE(size) \
(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
#define IHDR(inode, raw_inode) \
((struct ext4_xattr_ibody_header *) \
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
struct ext4_xattr_info {
int name_index;
const char *name;
const void *value;
size_t value_len;
};
struct ext4_xattr_search {
struct ext4_xattr_entry *first;
void *base;
void *end;
struct ext4_xattr_entry *here;
int not_found;
};
struct ext4_xattr_ibody_find {
struct ext4_xattr_search s;
struct ext4_iloc iloc;
};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
extern const struct xattr_handler *ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
const char *name,
void *buffer, size_t buffer_size);
extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
extern struct mb_cache *ext4_xattr_create_cache(char *name);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
#else
static inline int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr)
{
return 0;
}
#endif

82
fs/ext4/xattr_security.c Normal file
View file

@ -0,0 +1,82 @@
/*
* linux/fs/ext4/xattr_security.c
* Handler for storing security labels as extended attributes.
*/
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
static size_t
ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
const char *name, size_t name_len, int type)
{
const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
const size_t total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
}
static int
ext4_xattr_security_get(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int
ext4_xattr_security_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
static int
ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
const struct xattr *xattr;
handle_t *handle = fs_info;
int err = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
err = ext4_xattr_set_handle(handle, inode,
EXT4_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
xattr->value_len, 0);
if (err < 0)
break;
}
return err;
}
int
ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
const struct qstr *qstr)
{
return security_inode_init_security(inode, dir, qstr,
&ext4_initxattrs, handle);
}
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
.set = ext4_xattr_security_set,
};

58
fs/ext4/xattr_trusted.c Normal file
View file

@ -0,0 +1,58 @@
/*
* linux/fs/ext4/xattr_trusted.c
* Handler for trusted extended attributes.
*
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
static size_t
ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
const char *name, size_t name_len, int type)
{
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
return 0;
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
}
static int
ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int
ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ext4_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext4_xattr_trusted_list,
.get = ext4_xattr_trusted_get,
.set = ext4_xattr_trusted_set,
};

61
fs/ext4/xattr_user.c Normal file
View file

@ -0,0 +1,61 @@
/*
* linux/fs/ext4/xattr_user.c
* Handler for extended user attributes.
*
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
#include <linux/string.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
static size_t
ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
const char *name, size_t name_len, int type)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(dentry->d_sb, XATTR_USER))
return 0;
if (list && total_len <= list_size) {
memcpy(list, XATTR_USER_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
}
static int
ext4_xattr_user_get(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
name, buffer, size);
}
static int
ext4_xattr_user_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
name, value, size, flags);
}
const struct xattr_handler ext4_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext4_xattr_user_list,
.get = ext4_xattr_user_get,
.set = ext4_xattr_user_set,
};