Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

35
fs/jbd2/Kconfig Normal file
View file

@ -0,0 +1,35 @@
config JBD2
tristate
select CRC32
select CRYPTO
select CRYPTO_CRC32C
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
the ext4 and OCFS2 filesystems, but it could also be used to add
journal support to other file systems or block devices such
as RAID or LVM.
If you are using ext4 or OCFS2, you need to say Y here.
If you are not using ext4 or OCFS2 then you will
probably want to say N.
To compile this device as a module, choose M here. The module will be
called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
you cannot compile this code as a module.
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
allows you to enable debugging output while the system is running,
in order to help track down any problems you are having.
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
"echo 0 > /sys/module/jbd2/parameters/jbd2_debug".

7
fs/jbd2/Makefile Normal file
View file

@ -0,0 +1,7 @@
#
# Makefile for the linux journaling routines.
#
obj-$(CONFIG_JBD2) += jbd2.o
jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o

645
fs/jbd2/checkpoint.c Normal file
View file

@ -0,0 +1,645 @@
/*
* linux/fs/jbd2/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_transaction == NULL && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
}
return ret;
}
/*
* __jbd2_log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
{
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock. If there are no
* transactions ready to be checkpointed, try to recover
* journal space by calling cleanup_journal_tail(), and if
* that doesn't work, by waiting for the currently committing
* transaction to complete. If there is absolutely no way
* to make progress, this is either a BUG or corrupted
* filesystem, so abort the journal and leave a stack
* trace for forensic evidence.
*/
write_lock(&journal->j_state_lock);
if (journal->j_flags & JBD2_ABORT) {
mutex_unlock(&journal->j_checkpoint_mutex);
return;
}
spin_lock(&journal->j_list_lock);
nblocks = jbd2_space_needed(journal);
space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
if (journal->j_committing_transaction)
tid = journal->j_committing_transaction->t_tid;
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
if (chkpt) {
jbd2_log_do_checkpoint(journal);
} else if (jbd2_cleanup_journal_tail(journal) == 0) {
/* We were able to recover space; yay! */
;
} else if (tid) {
/*
* jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED
* is set. So we need to temporarily drop it.
*/
mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
write_lock(&journal->j_state_lock);
continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
__func__, nblocks, space_left);
printk(KERN_ERR "%s: no way to get more "
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
jbd2_journal_abort(journal, 0);
}
write_lock(&journal->j_state_lock);
} else {
spin_unlock(&journal->j_list_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
static void
__flush_batch(journal_t *journal, int *batch_count)
{
int i;
struct blk_plug plug;
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
* Called with j_checkpoint_mutex held.
*/
int jbd2_log_do_checkpoint(journal_t *journal)
{
struct journal_head *jh;
struct buffer_head *bh;
transaction_t *transaction;
tid_t this_tid;
int result, batch_count = 0;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
if (transaction->t_chp_stats.cs_chp_time == 0)
transaction->t_chp_stats.cs_chp_time = jiffies;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
goto out;
/* checkpoint all of the transaction's buffers */
while (transaction->t_checkpoint_list) {
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
get_bh(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
goto retry;
}
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
transaction->t_chp_stats.cs_forced_to_close++;
spin_unlock(&journal->j_list_lock);
if (unlikely(journal->j_flags & JBD2_UNMOUNT))
/*
* The journal thread is dead; so
* starting and waiting for a commit
* to finish will cause us to wait for
* a _very_ long time.
*/
printk(KERN_ERR
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
jbd2_log_start_commit(journal, tid);
jbd2_log_wait_commit(journal, tid);
goto retry;
}
if (!buffer_dirty(bh)) {
if (unlikely(buffer_write_io_error(bh)) && !result)
result = -EIO;
BUFFER_TRACE(bh, "remove from checkpoint");
if (__jbd2_journal_remove_checkpoint(jh))
/* The transaction was released; we're done */
goto out;
continue;
}
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal
* lock. We cannot afford to let the transaction
* logic start messing around with this buffer before
* we write it to disk, as that would break
* recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
journal->j_chkpt_bhs[batch_count++] = bh;
__buffer_relink_io(jh);
transaction->t_chp_stats.cs_written++;
if ((batch_count == JBD2_NR_BATCH) ||
need_resched() ||
spin_needbreak(&journal->j_list_lock))
goto unlock_and_flush;
}
if (batch_count) {
unlock_and_flush:
spin_unlock(&journal->j_list_lock);
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we issued all of the transaction's buffers, let's deal
* with the buffers that are out for I/O.
*/
restart2:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
goto out;
while (transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
get_bh(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart2;
}
if (unlikely(buffer_write_io_error(bh)) && !result)
result = -EIO;
/*
* Now in whatever state the buffer currently is, we
* know that it has been written out and so we can
* drop it from the list
*/
if (__jbd2_journal_remove_checkpoint(jh))
break;
}
out:
spin_unlock(&journal->j_list_lock);
if (result < 0)
jbd2_journal_abort(journal, result);
else
result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the super block if
* checkpointing may have failed. Otherwise, we would lose some metadata
* buffers which should be written-back to the filesystem.
*/
int jbd2_cleanup_journal_tail(journal_t *journal)
{
tid_t first_tid;
unsigned long blocknr;
if (is_journal_aborted(journal))
return 1;
if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
J_ASSERT(blocknr != 0);
/*
* We need to make sure that any blocks that were recently written out
* --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
* we drop the transactions from the journal. It's unlikely this will
* be necessary, especially with an appropriately sized journal, but we
* need this to guarantee correctness. Fortunately
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
__jbd2_update_log_tail(journal, first_tid, blocknr);
return 0;
}
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and
* release them.
*
* Called with j_list_lock held.
* Returns 1 if we freed the transaction, 0 otherwise.
*/
static int journal_clean_one_cp_list(struct journal_head *jh)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
int freed = 0;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
ret = __try_to_free_cp_buf(jh);
if (!ret)
return freed;
if (ret == 2)
return 1;
freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return freed;
} while (jh != last_jh);
return freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with j_list_lock held.
*/
void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
return;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
return;
if (ret)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret = journal_clean_one_cp_list(transaction->
t_checkpoint_io_list);
if (need_resched())
return;
/*
* Stop scanning if we couldn't free the transaction. This
* avoids pointless scanning of transactions which still
* weren't checkpointed.
*/
if (!ret)
return;
} while (transaction != last_transaction);
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
* The function can free jh and bh.
*
* This function is called with j_list_lock held.
*/
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction");
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
jbd2_journal_put_journal_head(jh);
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a running or committing transaction's checkpoing list,
* then even if the checkpoint list is empty, the transaction obviously
* cannot be dropped!
*
* The locking here around t_state is a bit sleazy.
* See the comment at the end of jbd2_journal_commit_transaction().
*/
if (transaction->t_state != T_FINISHED)
goto out;
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
stats = &transaction->t_chp_stats;
if (stats->cs_chp_time)
stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
jiffies);
trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
ret = 1;
out:
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
/* Get reference for checkpointing transaction */
jbd2_journal_grab_journal_head(jh2bh(jh));
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
trace_jbd2_drop_transaction(journal, transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}

1164
fs/jbd2/commit.c Normal file

File diff suppressed because it is too large Load diff

2672
fs/jbd2/journal.c Normal file

File diff suppressed because it is too large Load diff

869
fs/jbd2/recovery.c Normal file
View file

@ -0,0 +1,869 @@
/*
* linux/fs/jbd2/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/crc32.h>
#include <linux/blkdev.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
while (--n >= 0)
brelse (b[n]);
}
/*
* When reading from the journal, we are going through the block device
* layer directly and so there is no readahead being done for us. We
* need to implement any readahead ourselves if we want it to happen at
* all. Recovery is basically one long sequential read, so make sure we
* do the IO in reasonably large chunks.
*
* This is not so critical that we need to be enormously clever about
* the readahead size, though. 128K is a purely arbitrary, good-enough
* fixed value.
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned long long blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
nbufs = 0;
for (next = start; next < max; next++) {
err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
next);
goto failed;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
} else
brelse(bh);
}
if (nbufs)
ll_rw_block(READ, nbufs, bufs);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
/*
* Read a block from the journal
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
unsigned long long blocknr;
struct buffer_head *bh;
*bhp = NULL;
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EIO;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
offset);
return err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return -ENOMEM;
if (!buffer_uptodate(bh)) {
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
do_readahead(journal, offset);
wait_on_buffer(bh);
}
if (!buffer_uptodate(bh)) {
printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
}
*bhp = bh;
return 0;
}
static int jbd2_descr_block_csum_verify(journal_t *j,
void *buf)
{
struct jbd2_journal_block_tail *tail;
__be32 provided;
__u32 calculated;
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
provided = tail->t_checksum;
tail->t_checksum = 0;
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
tail->t_checksum = provided;
return provided == cpu_to_be32(calculated);
}
/*
* Count the number of in-use tags in a journal descriptor block.
*/
static int count_tags(journal_t *journal, struct buffer_head *bh)
{
char * tagp;
journal_block_tag_t * tag;
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
if (jbd2_journal_has_csum_v2or3(journal))
size -= sizeof(struct jbd2_journal_block_tail);
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes) <= size) {
tag = (journal_block_tag_t *) tagp;
nr++;
tagp += tag_bytes;
if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
tagp += 16;
if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
break;
}
return nr;
}
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
if (var >= (journal)->j_last) \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
/**
* jbd2_journal_recover - recovers a on-disk journal
* @journal: the journal to recover
*
* The primary function for recovering the log contents when mounting a
* journaled device.
*
* Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks
* in the log.
*/
int jbd2_journal_recover(journal_t *journal)
{
int err, err2;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
*/
if (!sb->s_start) {
jbd_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
}
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
jbd_debug(1, "JBD2: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
jbd2_journal_clear_revoke(journal);
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
if (!err)
err = err2;
}
return err;
}
/**
* jbd2_journal_skip_recovery - Start journal and wipe exiting records
* @journal: journal to startup
*
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
* This function does'nt appear to be exorted..
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID.
*/
int jbd2_journal_skip_recovery(journal_t *journal)
{
int err;
struct recovery_info info;
memset (&info, 0, sizeof(info));
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD2_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
jbd_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
#endif
journal->j_transaction_sequence = ++info.end_transaction;
}
journal->j_tail = 0;
return err;
}
static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
/*
* calc_chksums calculates the checksums for the blocks described in the
* descriptor block.
*/
static int calc_chksums(journal_t *journal, struct buffer_head *bh,
unsigned long *next_log_block, __u32 *crc32_sum)
{
int i, num_blks, err;
unsigned long io_block;
struct buffer_head *obh;
num_blks = count_tags(journal, bh);
/* Calculate checksum of the descriptor block. */
*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
for (i = 0; i < num_blks; i++) {
io_block = (*next_log_block)++;
wrap(journal, *next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
printk(KERN_ERR "JBD2: IO error %d recovering block "
"%lu in log\n", err, io_block);
return 1;
} else {
*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
obh->b_size);
}
put_bh(obh);
}
return 0;
}
static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
{
struct commit_header *h;
__be32 provided;
__u32 calculated;
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
h = buf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
h->h_chksum[0] = provided;
return provided == cpu_to_be32(calculated);
}
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
seq = cpu_to_be32(sequence);
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
int block_error = 0;
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched();
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
if (jbd2_journal_has_csum_v2or3(journal))
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
!jbd2_descr_block_csum_verify(journal,
bh->b_data)) {
err = -EIO;
brelse(bh);
goto failed;
}
/* If it is a valid descriptor block, replay it
* in pass REPLAY; if journal_checksums enabled, then
* calculate checksums in PASS_SCAN, otherwise,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
&crc32_sum)) {
put_bh(bh);
break;
}
put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
put_bh(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes)
<= journal->j_blocksize - descr_csum_size) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be16_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk(KERN_ERR
"JBD2: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
unsigned long long blocknr;
J_ASSERT(obh != NULL);
blocknr = read_tag_block(journal,
tag);
/* If the block has been
* revoked, then we're all done
* here. */
if (jbd2_journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Look for block corruption */
if (!jbd2_block_tag_csum_verify(
journal, tag, obh->b_data,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
success = -EIO;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
blocknr);
block_error = 1;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD2: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
if (flags & JBD2_FLAG_ESCAPE) {
*((__be32 *)nbh->b_data) =
cpu_to_be32(JBD2_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += tag_bytes;
if (!(flags & JBD2_FLAG_SAME_UUID))
tagp += 16;
if (flags & JBD2_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
/* How to differentiate between interrupted commit
* and journal corruption ?
*
* {nth transaction}
* Checksum Verification Failed
* |
* ____________________
* | |
* async_commit sync_commit
* | |
* | GO TO NEXT "Journal Corruption"
* | TRANSACTION
* |
* {(n+1)th transanction}
* |
* _______|______________
* | |
* Commit block found Commit block not found
* | |
* "Journal Corruption" |
* _____________|_________
* | |
* nth trans corrupt OR nth trans
* and (n+1)th interrupted interrupted
* before commit block
* could reach the disk.
* (Cannot find the difference in above
* mentioned conditions. Hence assume
* "Interrupted Commit".)
*/
/* Found an expected commit block: if checksums
* are present verify them in PASS_SCAN; else not
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
be32_to_cpu(cbh->h_chksum[0]);
chksum_err = chksum_seen = 0;
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
brelse(bh);
break;
}
if (crc32_sum == found_chksum &&
cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
cbh->h_chksum_size ==
JBD2_CRC32_CHKSUM_SIZE)
chksum_seen = 1;
else if (!(cbh->h_chksum_type == 0 &&
cbh->h_chksum_size == 0 &&
found_chksum == 0 &&
!chksum_seen))
/*
* If fs is mounted using an old kernel and then
* kernel with journal_chksum is used then we
* get a situation where the journal flag has
* checksum flag set but checksums are not
* present i.e chksum = 0, in the individual
* commit blocks.
* Hence to avoid checksum failures, in this
* situation, this extra check is added.
*/
chksum_err = 1;
if (chksum_err) {
info->end_transaction = next_commit_ID;
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
break;
}
}
crc32_sum = ~0;
}
if (pass == PASS_SCAN &&
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
info->end_transaction = next_commit_ID;
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
break;
}
}
brelse(bh);
next_commit_ID++;
continue;
case JBD2_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN) {
if (!info->end_transaction)
info->end_transaction = next_commit_ID;
} else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk(KERN_ERR "JBD2: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
if (block_error && success == 0)
success = -EIO;
return success;
failed:
return err;
}
static int jbd2_revoke_block_csum_verify(journal_t *j,
void *buf)
{
struct jbd2_journal_revoke_tail *tail;
__be32 provided;
__u32 calculated;
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
provided = tail->r_checksum;
tail->r_checksum = 0;
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
tail->r_checksum = provided;
return provided == cpu_to_be32(calculated);
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
{
jbd2_journal_revoke_header_t *header;
int offset, max;
int record_len = 4;
header = (jbd2_journal_revoke_header_t *) bh->b_data;
offset = sizeof(jbd2_journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
return -EINVAL;
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
record_len = 8;
while (offset + record_len <= max) {
unsigned long long blocknr;
int err;
if (record_len == 4)
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
else
blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
offset += record_len;
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}

762
fs/jbd2/revoke.c Normal file
View file

@ -0,0 +1,762 @@
/*
* linux/fs/jbd2/revoke.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 2000
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
* Revoke is the mechanism used to prevent old log records for deleted
* metadata from being replayed on top of newer data using the same
* blocks. The revoke mechanism is used in two separate places:
*
* + Commit: during commit we write the entire list of the current
* transaction's revoked blocks to the journal
*
* + Recovery: during recovery we record the transaction ID of all
* revoked blocks. If there are multiple revoke records in the log
* for a single block, only the last one counts, and if there is a log
* entry for a block beyond the last revoke, then that log entry still
* gets replayed.
*
* We can get interactions between revokes and new log data within a
* single transaction:
*
* Block is revoked and then journaled:
* The desired end result is the journaling of the new block, so we
* cancel the revoke before the transaction commits.
*
* Block is journaled and then revoked:
* The revoke must take precedence over the write of the block, so we
* need either to cancel the journal entry or to write the revoke
* later in the log than the log block. In this case, we choose the
* latter: journaling a block cancels any revoke record for that block
* in the current transaction, so any revoke for that block in the
* transaction must have happened after the block was journaled and so
* the revoke must take precedence.
*
* Block is revoked and then written as data:
* The data write is allowed to succeed, but the revoke is _not_
* cancelled. We still need to prevent old log records from
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
* We cache revoke status of a buffer in the current transaction in b_states
* bits. As the name says, revokevalid flag indicates that the cached revoke
* status of a buffer is valid and we can rely on the cached status.
*
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
* RevokeValid set, Revoked clear:
* buffer has not been revoked, and cancel_revoke
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
*
* Locking rules:
* We keep two hash tables of revoke records. One hashtable belongs to the
* running transaction (is pointed to by journal->j_revoke), the other one
* belongs to the committing transaction. Accesses to the second hash table
* happen only from the kjournald and no other thread touches this table. Also
* journal_switch_revoke_table() which switches which hashtable belongs to the
* running and which to the committing transaction is called only from
* kjournald. Therefore we need no locks when accessing the hashtable belonging
* to the committing transaction.
*
* All users operating on the hash table belonging to the running transaction
* have a handle to the transaction. Therefore they are safe from kjournald
* switching hash tables under them. For operations on the lists of entries in
* the hash table j_revoke_lock is used.
*
* Finally, also replay code uses the hash tables but at this moment no one else
* can touch them (filesystem isn't mounted yet) and hence no locking is
* needed.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/log2.h>
#include <linux/hash.h>
#endif
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;
/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
last transaction to revoke this block. */
struct jbd2_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
unsigned long long blocknr;
};
/* The revoke table is just a simple hash table of revoke records. */
struct jbd2_revoke_table_s
{
/* It is conceivable that we might want a larger hash table
* for recovery. Must be a power of two. */
int hash_size;
int hash_shift;
struct list_head *hash_table;
};
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
struct list_head *,
struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
static inline int hash(journal_t *journal, unsigned long long block)
{
return hash_64(block, journal->j_revoke->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
tid_t seq)
{
struct list_head *hash_list;
struct jbd2_revoke_record_s *record;
repeat:
record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
if (!record)
goto oom;
record->sequence = seq;
record->blocknr = blocknr;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
oom:
if (!journal_oom_retry)
return -ENOMEM;
jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
yield();
goto repeat;
}
/* Find a revoke record in the journal's hash table. */
static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
unsigned long long blocknr)
{
struct list_head *hash_list;
struct jbd2_revoke_record_s *record;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
record = (struct jbd2_revoke_record_s *) hash_list->next;
while (&(record->hash) != hash_list) {
if (record->blocknr == blocknr) {
spin_unlock(&journal->j_revoke_lock);
return record;
}
record = (struct jbd2_revoke_record_s *) record->hash.next;
}
spin_unlock(&journal->j_revoke_lock);
return NULL;
}
void jbd2_journal_destroy_revoke_caches(void)
{
if (jbd2_revoke_record_cache) {
kmem_cache_destroy(jbd2_revoke_record_cache);
jbd2_revoke_record_cache = NULL;
}
if (jbd2_revoke_table_cache) {
kmem_cache_destroy(jbd2_revoke_table_cache);
jbd2_revoke_table_cache = NULL;
}
}
int __init jbd2_journal_init_revoke_caches(void)
{
J_ASSERT(!jbd2_revoke_record_cache);
J_ASSERT(!jbd2_revoke_table_cache);
jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
if (!jbd2_revoke_record_cache)
goto record_cache_failure;
jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
SLAB_TEMPORARY);
if (!jbd2_revoke_table_cache)
goto table_cache_failure;
return 0;
table_cache_failure:
jbd2_journal_destroy_revoke_caches();
record_cache_failure:
return -ENOMEM;
}
static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
int shift = 0;
int tmp = hash_size;
struct jbd2_revoke_table_s *table;
table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
if (!table)
goto out;
while((tmp >>= 1UL) != 0UL)
shift++;
table->hash_size = hash_size;
table->hash_shift = shift;
table->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(jbd2_revoke_table_cache, table);
table = NULL;
goto out;
}
for (tmp = 0; tmp < hash_size; tmp++)
INIT_LIST_HEAD(&table->hash_table[tmp]);
out:
return table;
}
static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
for (i = 0; i < table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT(list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(jbd2_revoke_table_cache, table);
}
/* Initialise the revoke table for a given journal to a given size. */
int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
{
J_ASSERT(journal->j_revoke_table[0] == NULL);
J_ASSERT(is_power_of_2(hash_size));
journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
if (!journal->j_revoke_table[0])
goto fail0;
journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
if (!journal->j_revoke_table[1])
goto fail1;
journal->j_revoke = journal->j_revoke_table[1];
spin_lock_init(&journal->j_revoke_lock);
return 0;
fail1:
jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
fail0:
return -ENOMEM;
}
/* Destroy a journal's revoke table. The table must already be empty! */
void jbd2_journal_destroy_revoke(journal_t *journal)
{
journal->j_revoke = NULL;
if (journal->j_revoke_table[0])
jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
if (journal->j_revoke_table[1])
jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]);
}
#ifdef __KERNEL__
/*
* jbd2_journal_revoke: revoke a given buffer_head from the journal. This
* prevents the block from being replayed during recovery if we take a
* crash after this current transaction commits. Any subsequent
* metadata writes of the buffer in this transaction cancel the
* revoke.
*
* Note that this call may block --- it is up to the caller to make
* sure that there are no further calls to journal_write_metadata
* before the revoke is complete. In ext3, this implies calling the
* revoke before clearing the block bitmap when we are deleting
* metadata.
*
* Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
* parameter, but does _not_ forget the buffer_head if the bh was only
* found implicitly.
*
* bh_in may not be a journalled buffer - it may have come off
* the hash tables without an attached journal_head.
*
* If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
* by one.
*/
int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
journal_t *journal;
struct block_device *bdev;
int err;
might_sleep();
if (bh_in)
BUFFER_TRACE(bh_in, "enter");
journal = handle->h_transaction->t_journal;
if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
J_ASSERT (!"Cannot set revoke feature!");
return -EINVAL;
}
bdev = journal->j_fs_dev;
bh = bh_in;
if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
#ifdef JBD2_EXPENSIVE_CHECKING
else {
struct buffer_head *bh2;
/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
/* ...then it better be revoked too,
* since it's illegal to create a revoke
* record against a buffer_head which is
* not marked revoked --- that would
* risk missing a subsequent revoke
* cancel. */
J_ASSERT_BH(bh2, buffer_revoked(bh2));
put_bh(bh2);
}
}
#endif
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
if (bh) {
if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
"inconsistent data on disk")) {
if (!bh_in)
brelse(bh);
return -EIO;
}
set_buffer_revoked(bh);
set_buffer_revokevalid(bh);
if (bh_in) {
BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
jbd2_journal_forget(handle, bh_in);
} else {
BUFFER_TRACE(bh, "call brelse");
__brelse(bh);
}
}
jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
return err;
}
/*
* Cancel an outstanding revoke. For use only internally by the
* journaling code (called from jbd2_journal_get_write_access).
*
* We trust buffer_revoked() on the buffer if the buffer is already
* being journaled: if there is no revoke pending on the buffer, then we
* don't do anything here.
*
* This would break if it were possible for a buffer to be revoked and
* discarded, and then reallocated within the same transaction. In such
* a case we would have lost the revoked bit, but when we arrived here
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
* not, we can't trust the revoke bit, and we need to do the
* full search for a revoke record. */
if (test_set_buffer_revokevalid(bh)) {
need_cancel = test_clear_buffer_revoked(bh);
} else {
need_cancel = 1;
clear_buffer_revoked(bh);
}
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
did_revoke = 1;
}
}
#ifdef JBD2_EXPENSIVE_CHECKING
/* There better not be one left behind by now! */
record = find_revoke_record(journal, bh->b_blocknr);
J_ASSERT_JH(jh, record == NULL);
#endif
/* Finally, have we just cleared revoke on an unhashed
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
return did_revoke;
}
/*
* journal_clear_revoked_flag clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffers in the next
* transaction which is going to be started.
*/
void jbd2_clear_buffer_revoked_flags(journal_t *journal)
{
struct jbd2_revoke_table_s *revoke = journal->j_revoke;
int i = 0;
for (i = 0; i < revoke->hash_size; i++) {
struct list_head *hash_list;
struct list_head *list_entry;
hash_list = &revoke->hash_table[i];
list_for_each(list_entry, hash_list) {
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
bh = __find_get_block(journal->j_fs_dev,
record->blocknr,
journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
}
}
}
}
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
int i;
if (journal->j_revoke == journal->j_revoke_table[0])
journal->j_revoke = journal->j_revoke_table[1];
else
journal->j_revoke = journal->j_revoke_table[0];
for (i = 0; i < journal->j_revoke->hash_size; i++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
}
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
struct list_head *log_bufs,
int write_op)
{
struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
int i, offset, count;
descriptor = NULL;
offset = 0;
count = 0;
/* select revoke table for committing transaction */
revoke = journal->j_revoke == journal->j_revoke_table[0] ?
journal->j_revoke_table[1] : journal->j_revoke_table[0];
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
list_del(&record->hash);
kmem_cache_free(jbd2_revoke_record_cache, record);
}
}
if (descriptor)
flush_descriptor(journal, descriptor, offset, write_op);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
/*
* Write out one revoke record. We need to create a new descriptor
* block if the old one is full or if we have not already created one.
*/
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
struct list_head *log_bufs,
struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
int csum_size = 0;
struct buffer_head *descriptor;
int offset;
journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
jbd2_journal_write_revoke_records in order to free all of the
revoke records: only the IO to the journal is omitted. */
if (is_journal_aborted(journal))
return;
descriptor = *descriptorp;
offset = *offsetp;
/* Do we need to leave space at the end for a checksum? */
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset >= journal->j_blocksize - csum_size) {
flush_descriptor(journal, descriptor, offset, write_op);
descriptor = NULL;
}
}
if (!descriptor) {
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
BUFFER_TRACE(descriptor, "file in log_bufs");
jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
* ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
*offsetp = offset;
}
static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
{
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
tail->r_checksum = 0;
csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->r_checksum = cpu_to_be32(csum);
}
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
* be waited for during commit, so it has to go onto the appropriate
* journal buffer list.
*/
static void flush_descriptor(journal_t *journal,
struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
if (is_journal_aborted(journal)) {
put_bh(descriptor);
return;
}
header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
jbd2_revoke_csum_set(journal, descriptor);
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
write_dirty_buffer(descriptor, write_op);
}
#endif
/*
* Revoke support for recovery.
*
* Recovery needs to be able to:
*
* record all revoke records, including the tid of the latest instance
* of each revoke in the journal
*
* check whether a given block in a given transaction should be replayed
* (ie. has not been revoked by a revoke record in that or a subsequent
* transaction)
*
* empty the revoke table after recovery.
*/
/*
* First, setting revoke records. We create a new revoke record for
* every block ever revoked in the log as we scan it for recovery, and
* we update the existing records if we find multiple revokes for a
* single block.
*/
int jbd2_journal_set_revoke(journal_t *journal,
unsigned long long blocknr,
tid_t sequence)
{
struct jbd2_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (record) {
/* If we have multiple occurrences, only record the
* latest sequence number in the hashed record */
if (tid_gt(sequence, record->sequence))
record->sequence = sequence;
return 0;
}
return insert_revoke_hash(journal, blocknr, sequence);
}
/*
* Test revoke records. For a given block referenced in the log, has
* that block been revoked? A revoke record with a given transaction
* sequence number revokes all blocks in that transaction and earlier
* ones, but later transactions still need replayed.
*/
int jbd2_journal_test_revoke(journal_t *journal,
unsigned long long blocknr,
tid_t sequence)
{
struct jbd2_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (!record)
return 0;
if (tid_gt(sequence, record->sequence))
return 0;
return 1;
}
/*
* Finally, once recovery is over, we need to clear the revoke table so
* that it can be reused by the running filesystem.
*/
void jbd2_journal_clear_revoke(journal_t *journal)
{
int i;
struct list_head *hash_list;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
revoke = journal->j_revoke;
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s*) hash_list->next;
list_del(&record->hash);
kmem_cache_free(jbd2_revoke_record_cache, record);
}
}
}

2480
fs/jbd2/transaction.c Normal file

File diff suppressed because it is too large Load diff