Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

30
fs/jbd/Kconfig Normal file
View file

@ -0,0 +1,30 @@
config JBD
tristate
help
This is a generic journalling layer for block devices. It is
currently used by the ext3 file system, but it could also be
used to add journal support to other file systems or block
devices such as RAID or LVM.
If you are using the ext3 file system, you need to say Y here.
If you are not using ext3 then you will probably want to say N.
To compile this device as a module, choose M here: the module will be
called jbd. If you are compiling ext3 into the kernel, you
cannot compile this code as a module.
config JBD_DEBUG
bool "JBD (ext3) debugging support"
depends on JBD && DEBUG_FS
help
If you are using the ext3 journaled file system (or potentially any
other file system/device using JBD), this option allows you to
enable debugging output while the system is running, in order to
help track down any problems you are having. By default the
debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
number between 1 and 5, the higher the number, the more debugging
output is generated. To turn debugging off again, do
"echo 0 > /sys/kernel/debug/jbd/jbd-debug".

7
fs/jbd/Makefile Normal file
View file

@ -0,0 +1,7 @@
#
# Makefile for the linux journaling routines.
#
obj-$(CONFIG_JBD) += jbd.o
jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o

782
fs/jbd/checkpoint.c Normal file
View file

@ -0,0 +1,782 @@
/*
* linux/fs/jbd/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
/*
* Get our reference so that bh cannot be freed before
* we unlock it
*/
get_bh(bh);
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
} else {
jbd_unlock_bh_state(bh);
}
return ret;
}
/*
* __log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __log_wait_for_space(journal_t *journal)
{
int nblocks, space_left;
assert_spin_locked(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
while (__log_space_left(journal) < nblocks) {
if (journal->j_flags & JFS_ABORT)
return;
spin_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock. If there are no
* transactions ready to be checkpointed, try to recover
* journal space by calling cleanup_journal_tail(), and if
* that doesn't work, by waiting for the currently committing
* transaction to complete. If there is absolutely no way
* to make progress, this is either a BUG or corrupted
* filesystem, so abort the journal and leave a stack
* trace for forensic evidence.
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
nblocks = jbd_space_needed(journal);
space_left = __log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
if (journal->j_committing_transaction)
tid = journal->j_committing_transaction->t_tid;
spin_unlock(&journal->j_list_lock);
spin_unlock(&journal->j_state_lock);
if (chkpt) {
log_do_checkpoint(journal);
} else if (cleanup_journal_tail(journal) == 0) {
/* We were able to recover space; yay! */
;
} else if (tid) {
log_wait_commit(journal, tid);
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
__func__, nblocks, space_left);
printk(KERN_ERR "%s: no way to get more "
"journal space\n", __func__);
WARN_ON(1);
journal_abort(journal, 0);
}
spin_lock(&journal->j_state_lock);
} else {
spin_unlock(&journal->j_list_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
/*
* We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
* The caller must restart a list walk. Wait for someone else to run
* jbd_unlock_bh_state().
*/
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
__releases(journal->j_list_lock)
{
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_lock_bh_state(bh);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
* from the one in which they were submitted for IO.
*
* Return 0 on success, and return <0 if some buffers have failed
* to be written out.
*
* Called with j_list_lock held.
*/
static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
tid_t this_tid;
int released = 0;
int ret = 0;
this_tid = transaction->t_tid;
restart:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
return ret;
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
if (unlikely(buffer_write_io_error(bh)))
ret = -EIO;
/*
* Now in whatever state the buffer currently is, we know that
* it has been written out and so we can drop it from the list
*/
released = __journal_remove_checkpoint(jh);
jbd_unlock_bh_state(bh);
__brelse(bh);
}
return ret;
}
#define NR_BATCH 64
static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
struct blk_plug plug;
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
write_dirty_buffer(bhs[i], WRITE_SYNC);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
/*
* Try to flush one buffer from the checkpoint list to disk.
*
* Return 1 if something happened which requires us to abort the current
* scan of the checkpoint list. Return <0 if the buffer has failed to
* be written out.
*
* Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
struct buffer_head **bhs, int *batch_count)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
ret = 1;
} else if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
log_start_commit(journal, tid);
log_wait_commit(journal, tid);
ret = 1;
} else if (!buffer_dirty(bh)) {
ret = 1;
if (unlikely(buffer_write_io_error(bh)))
ret = -EIO;
get_bh(bh);
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
__brelse(bh);
} else {
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal lock.
* We cannot afford to let the transaction logic start
* messing around with this buffer before we write it to
* disk, as that would break recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
bhs[*batch_count] = bh;
__buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
(*batch_count)++;
if (*batch_count == NR_BATCH) {
spin_unlock(&journal->j_list_lock);
__flush_batch(journal, bhs, batch_count);
ret = 1;
}
}
return ret;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
* Called with j_checkpoint_mutex held.
*/
int log_do_checkpoint(journal_t *journal)
{
transaction_t *transaction;
tid_t this_tid;
int result;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = cleanup_journal_tail(journal);
trace_jbd_checkpoint(journal, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
int retry = 0, err;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
retry = 1;
break;
}
retry = __process_buffer(journal, jh, bhs,&batch_count);
if (retry < 0 && !result)
result = retry;
if (!retry && (need_resched() ||
spin_needbreak(&journal->j_list_lock))) {
spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
}
if (batch_count) {
if (!retry) {
spin_unlock(&journal->j_list_lock);
retry = 1;
}
__flush_batch(journal, bhs, &batch_count);
}
if (retry) {
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
err = __wait_cp_io(journal, transaction);
if (!result)
result = err;
}
out:
spin_unlock(&journal->j_list_lock);
if (result < 0)
journal_abort(journal, result);
else
result = cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the super block if
* checkpointing may have failed. Otherwise, we would lose some metadata
* buffers which should be written-back to the filesystem.
*/
int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
unsigned int blocknr, freed;
if (is_journal_aborted(journal))
return 1;
/*
* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
* start.
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
if (transaction) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_committing_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_running_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = journal->j_head;
} else {
first_tid = journal->j_transaction_sequence;
blocknr = journal->j_head;
}
spin_unlock(&journal->j_list_lock);
J_ASSERT(blocknr != 0);
/* If the oldest pinned transaction is at the tail of the log
already then there's not much we can do right now. */
if (journal->j_tail_sequence == first_tid) {
spin_unlock(&journal->j_state_lock);
return 1;
}
spin_unlock(&journal->j_state_lock);
/*
* We need to make sure that any blocks that were recently written out
* --- perhaps by log_do_checkpoint() --- are flushed out before we
* drop the transactions from the journal. Similarly we need to be sure
* superblock makes it to disk before next transaction starts reusing
* freed space (otherwise we could replay some blocks of the new
* transaction thinking they belong to the old one). So we use
* WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
* with an appropriately sized journal, but we need this to guarantee
* correctness. Fortunately cleanup_journal_tail() doesn't get called
* all that often.
*/
journal_update_sb_log_tail(journal, first_tid, blocknr,
WRITE_FLUSH_FUA);
spin_lock(&journal->j_state_lock);
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
freed = blocknr - journal->j_tail;
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %u), "
"freeing %u\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
return 0;
}
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and release
* them.
*
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret, freed = 0;
*released = 0;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
/* Use trylock because of the ranking */
if (jbd_trylock_bh_state(jh2bh(jh))) {
ret = __try_to_free_cp_buf(jh);
if (ret) {
freed++;
if (ret == 2) {
*released = 1;
return freed;
}
}
}
/*
* This function only frees up some memory
* if possible so we dont have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return freed;
} while (jh != last_jh);
return freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
int __journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret = 0;
int released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
goto out;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_list, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
goto out;
if (released)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_io_list, &released);
if (need_resched())
goto out;
} while (transaction != last_transaction);
out:
return ret;
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
* The function can free jh and bh.
*
* This function is called with j_list_lock held.
* This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
int __journal_remove_checkpoint(struct journal_head *jh)
{
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction");
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
journal_put_journal_head(jh);
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a running or committing transaction's checkpoing list,
* then even if the checkpoint list is empty, the transaction obviously
* cannot be dropped!
*
* The locking here around t_state is a bit sleazy.
* See the comment at the end of journal_commit_transaction().
*/
if (transaction->t_state != T_FINISHED)
goto out;
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
__journal_drop_transaction(journal, transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
wake_up(&journal->j_wait_logspace);
ret = 1;
out:
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
/* Get reference for checkpointing transaction */
journal_grab_journal_head(jh2bh(jh));
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(transaction->t_updates == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
trace_jbd_drop_transaction(journal, transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
kfree(transaction);
}

1021
fs/jbd/commit.c Normal file

File diff suppressed because it is too large Load diff

2146
fs/jbd/journal.c Normal file

File diff suppressed because it is too large Load diff

594
fs/jbd/recovery.c Normal file
View file

@ -0,0 +1,594 @@
/*
* linux/fs/jbd/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/blkdev.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
while (--n >= 0)
brelse (b[n]);
}
/*
* When reading from the journal, we are going through the block device
* layer directly and so there is no readahead being done for us. We
* need to implement any readahead ourselves if we want it to happen at
* all. Recovery is basically one long sequential read, so make sure we
* do the IO in reasonably large chunks.
*
* This is not so critical that we need to be enormously clever about
* the readahead size, though. 128K is a purely arbitrary, good-enough
* fixed value.
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned int blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
nbufs = 0;
for (next = start; next < max; next++) {
err = journal_bmap(journal, next, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
next);
goto failed;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
} else
brelse(bh);
}
if (nbufs)
ll_rw_block(READ, nbufs, bufs);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
/*
* Read a block from the journal
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
unsigned int blocknr;
struct buffer_head *bh;
*bhp = NULL;
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD: corrupted journal superblock\n");
return -EIO;
}
err = journal_bmap(journal, offset, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
offset);
return err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return -ENOMEM;
if (!buffer_uptodate(bh)) {
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
do_readahead(journal, offset);
wait_on_buffer(bh);
}
if (!buffer_uptodate(bh)) {
printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
}
*bhp = bh;
return 0;
}
/*
* Count the number of in-use tags in a journal descriptor block.
*/
static int count_tags(struct buffer_head *bh, int size)
{
char * tagp;
journal_block_tag_t * tag;
int nr = 0;
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
tag = (journal_block_tag_t *) tagp;
nr++;
tagp += sizeof(journal_block_tag_t);
if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
tagp += 16;
if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
break;
}
return nr;
}
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
if (var >= (journal)->j_last) \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
/**
* journal_recover - recovers a on-disk journal
* @journal: the journal to recover
*
* The primary function for recovering the log contents when mounting a
* journaled device.
*
* Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks
* in the log.
*/
int journal_recover(journal_t *journal)
{
int err, err2;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
*/
if (!sb->s_start) {
jbd_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
}
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
jbd_debug(1, "JBD: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
journal_clear_revoke(journal);
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
/* Flush disk caches to get replayed data on the permanent storage */
if (journal->j_flags & JFS_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
if (!err)
err = err2;
}
return err;
}
/**
* journal_skip_recovery - Start journal and wipe exiting records
* @journal: journal to startup
*
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
* This function does'nt appear to be exorted..
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID.
*/
int journal_skip_recovery(journal_t *journal)
{
int err;
struct recovery_info info;
memset (&info, 0, sizeof(info));
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
jbd_debug(1,
"JBD: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
#endif
journal->j_transaction_sequence = ++info.end_transaction;
}
journal->j_tail = 0;
return err;
}
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned int next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched();
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD: checking block %u\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
* in pass REPLAY; otherwise, just skip over the
* blocks it describes. */
if (pass != PASS_REPLAY) {
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
brelse(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
unsigned int io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
"block %u in log\n",
err, io_block);
} else {
unsigned int blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
/* If the block has been
* revoked, then we're all done
* here. */
if (journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
if (flags & JFS_FLAG_ESCAPE) {
*((__be32 *)nbh->b_data) =
cpu_to_be32(JFS_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += sizeof(journal_block_tag_t);
if (!(flags & JFS_FLAG_SAME_UUID))
tagp += 16;
if (flags & JFS_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JFS_COMMIT_BLOCK:
/* Found an expected commit block: not much to
* do other than move on to the next sequence
* number. */
brelse(bh);
next_commit_ID++;
continue;
case JFS_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN)
info->end_transaction = next_commit_ID;
else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk (KERN_ERR "JBD: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
return success;
failed:
return err;
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
{
journal_revoke_header_t *header;
int offset, max;
header = (journal_revoke_header_t *) bh->b_data;
offset = sizeof(journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
while (offset < max) {
unsigned int blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
offset += 4;
err = journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}

733
fs/jbd/revoke.c Normal file
View file

@ -0,0 +1,733 @@
/*
* linux/fs/jbd/revoke.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 2000
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
* Revoke is the mechanism used to prevent old log records for deleted
* metadata from being replayed on top of newer data using the same
* blocks. The revoke mechanism is used in two separate places:
*
* + Commit: during commit we write the entire list of the current
* transaction's revoked blocks to the journal
*
* + Recovery: during recovery we record the transaction ID of all
* revoked blocks. If there are multiple revoke records in the log
* for a single block, only the last one counts, and if there is a log
* entry for a block beyond the last revoke, then that log entry still
* gets replayed.
*
* We can get interactions between revokes and new log data within a
* single transaction:
*
* Block is revoked and then journaled:
* The desired end result is the journaling of the new block, so we
* cancel the revoke before the transaction commits.
*
* Block is journaled and then revoked:
* The revoke must take precedence over the write of the block, so we
* need either to cancel the journal entry or to write the revoke
* later in the log than the log block. In this case, we choose the
* latter: journaling a block cancels any revoke record for that block
* in the current transaction, so any revoke for that block in the
* transaction must have happened after the block was journaled and so
* the revoke must take precedence.
*
* Block is revoked and then written as data:
* The data write is allowed to succeed, but the revoke is _not_
* cancelled. We still need to prevent old log records from
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
* We cache revoke status of a buffer in the current transaction in b_states
* bits. As the name says, revokevalid flag indicates that the cached revoke
* status of a buffer is valid and we can rely on the cached status.
*
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
* RevokeValid set, Revoked clear:
* buffer has not been revoked, and cancel_revoke
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
*
* Locking rules:
* We keep two hash tables of revoke records. One hashtable belongs to the
* running transaction (is pointed to by journal->j_revoke), the other one
* belongs to the committing transaction. Accesses to the second hash table
* happen only from the kjournald and no other thread touches this table. Also
* journal_switch_revoke_table() which switches which hashtable belongs to the
* running and which to the committing transaction is called only from
* kjournald. Therefore we need no locks when accessing the hashtable belonging
* to the committing transaction.
*
* All users operating on the hash table belonging to the running transaction
* have a handle to the transaction. Therefore they are safe from kjournald
* switching hash tables under them. For operations on the lists of entries in
* the hash table j_revoke_lock is used.
*
* Finally, also replay code uses the hash tables but at this moment no one else
* can touch them (filesystem isn't mounted yet) and hence no locking is
* needed.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#endif
#include <linux/log2.h>
#include <linux/hash.h>
static struct kmem_cache *revoke_record_cache;
static struct kmem_cache *revoke_table_cache;
/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
last transaction to revoke this block. */
struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
unsigned int blocknr;
};
/* The revoke table is just a simple hash table of revoke records. */
struct jbd_revoke_table_s
{
/* It is conceivable that we might want a larger hash table
* for recovery. Must be a power of two. */
int hash_size;
int hash_shift;
struct list_head *hash_table;
};
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
struct journal_head **, int *,
struct jbd_revoke_record_s *, int);
static void flush_descriptor(journal_t *, struct journal_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
return hash_32(block, table->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
tid_t seq)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
repeat:
record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
if (!record)
goto oom;
record->sequence = seq;
record->blocknr = blocknr;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
oom:
if (!journal_oom_retry)
return -ENOMEM;
jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
yield();
goto repeat;
}
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
unsigned int blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
record = (struct jbd_revoke_record_s *) hash_list->next;
while (&(record->hash) != hash_list) {
if (record->blocknr == blocknr) {
spin_unlock(&journal->j_revoke_lock);
return record;
}
record = (struct jbd_revoke_record_s *) record->hash.next;
}
spin_unlock(&journal->j_revoke_lock);
return NULL;
}
void journal_destroy_revoke_caches(void)
{
if (revoke_record_cache) {
kmem_cache_destroy(revoke_record_cache);
revoke_record_cache = NULL;
}
if (revoke_table_cache) {
kmem_cache_destroy(revoke_table_cache);
revoke_table_cache = NULL;
}
}
int __init journal_init_revoke_caches(void)
{
J_ASSERT(!revoke_record_cache);
J_ASSERT(!revoke_table_cache);
revoke_record_cache = kmem_cache_create("revoke_record",
sizeof(struct jbd_revoke_record_s),
0,
SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
NULL);
if (!revoke_record_cache)
goto record_cache_failure;
revoke_table_cache = kmem_cache_create("revoke_table",
sizeof(struct jbd_revoke_table_s),
0, SLAB_TEMPORARY, NULL);
if (!revoke_table_cache)
goto table_cache_failure;
return 0;
table_cache_failure:
journal_destroy_revoke_caches();
record_cache_failure:
return -ENOMEM;
}
static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
{
int i;
struct jbd_revoke_table_s *table;
table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!table)
goto out;
table->hash_size = hash_size;
table->hash_shift = ilog2(hash_size);
table->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(revoke_table_cache, table);
table = NULL;
goto out;
}
for (i = 0; i < hash_size; i++)
INIT_LIST_HEAD(&table->hash_table[i]);
out:
return table;
}
static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
for (i = 0; i < table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT(list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(revoke_table_cache, table);
}
/* Initialise the revoke table for a given journal to a given size. */
int journal_init_revoke(journal_t *journal, int hash_size)
{
J_ASSERT(journal->j_revoke_table[0] == NULL);
J_ASSERT(is_power_of_2(hash_size));
journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
if (!journal->j_revoke_table[0])
goto fail0;
journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
if (!journal->j_revoke_table[1])
goto fail1;
journal->j_revoke = journal->j_revoke_table[1];
spin_lock_init(&journal->j_revoke_lock);
return 0;
fail1:
journal_destroy_revoke_table(journal->j_revoke_table[0]);
fail0:
return -ENOMEM;
}
/* Destroy a journal's revoke table. The table must already be empty! */
void journal_destroy_revoke(journal_t *journal)
{
journal->j_revoke = NULL;
if (journal->j_revoke_table[0])
journal_destroy_revoke_table(journal->j_revoke_table[0]);
if (journal->j_revoke_table[1])
journal_destroy_revoke_table(journal->j_revoke_table[1]);
}
#ifdef __KERNEL__
/*
* journal_revoke: revoke a given buffer_head from the journal. This
* prevents the block from being replayed during recovery if we take a
* crash after this current transaction commits. Any subsequent
* metadata writes of the buffer in this transaction cancel the
* revoke.
*
* Note that this call may block --- it is up to the caller to make
* sure that there are no further calls to journal_write_metadata
* before the revoke is complete. In ext3, this implies calling the
* revoke before clearing the block bitmap when we are deleting
* metadata.
*
* Revoke performs a journal_forget on any buffer_head passed in as a
* parameter, but does _not_ forget the buffer_head if the bh was only
* found implicitly.
*
* bh_in may not be a journalled buffer - it may have come off
* the hash tables without an attached journal_head.
*
* If bh_in is non-zero, journal_revoke() will decrement its b_count
* by one.
*/
int journal_revoke(handle_t *handle, unsigned int blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
journal_t *journal;
struct block_device *bdev;
int err;
might_sleep();
if (bh_in)
BUFFER_TRACE(bh_in, "enter");
journal = handle->h_transaction->t_journal;
if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
J_ASSERT (!"Cannot set revoke feature!");
return -EINVAL;
}
bdev = journal->j_fs_dev;
bh = bh_in;
if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
#ifdef JBD_EXPENSIVE_CHECKING
else {
struct buffer_head *bh2;
/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
/* ...then it better be revoked too,
* since it's illegal to create a revoke
* record against a buffer_head which is
* not marked revoked --- that would
* risk missing a subsequent revoke
* cancel. */
J_ASSERT_BH(bh2, buffer_revoked(bh2));
put_bh(bh2);
}
}
#endif
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
if (bh) {
if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
"inconsistent data on disk")) {
if (!bh_in)
brelse(bh);
return -EIO;
}
set_buffer_revoked(bh);
set_buffer_revokevalid(bh);
if (bh_in) {
BUFFER_TRACE(bh_in, "call journal_forget");
journal_forget(handle, bh_in);
} else {
BUFFER_TRACE(bh, "call brelse");
__brelse(bh);
}
}
jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
return err;
}
/*
* Cancel an outstanding revoke. For use only internally by the
* journaling code (called from journal_get_write_access).
*
* We trust buffer_revoked() on the buffer if the buffer is already
* being journaled: if there is no revoke pending on the buffer, then we
* don't do anything here.
*
* This would break if it were possible for a buffer to be revoked and
* discarded, and then reallocated within the same transaction. In such
* a case we would have lost the revoked bit, but when we arrived here
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
* not, we can't trust the revoke bit, and we need to do the
* full search for a revoke record. */
if (test_set_buffer_revokevalid(bh)) {
need_cancel = test_clear_buffer_revoked(bh);
} else {
need_cancel = 1;
clear_buffer_revoked(bh);
}
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(revoke_record_cache, record);
did_revoke = 1;
}
}
#ifdef JBD_EXPENSIVE_CHECKING
/* There better not be one left behind by now! */
record = find_revoke_record(journal, bh->b_blocknr);
J_ASSERT_JH(jh, record == NULL);
#endif
/* Finally, have we just cleared revoke on an unhashed
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
return did_revoke;
}
/*
* journal_clear_revoked_flags clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffer in the next
* transaction which is going to be started.
*/
void journal_clear_buffer_revoked_flags(journal_t *journal)
{
struct jbd_revoke_table_s *revoke = journal->j_revoke;
int i = 0;
for (i = 0; i < revoke->hash_size; i++) {
struct list_head *hash_list;
struct list_head *list_entry;
hash_list = &revoke->hash_table[i];
list_for_each(list_entry, hash_list) {
struct jbd_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd_revoke_record_s *)list_entry;
bh = __find_get_block(journal->j_fs_dev,
record->blocknr,
journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
}
}
}
}
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
*/
void journal_switch_revoke_table(journal_t *journal)
{
int i;
if (journal->j_revoke == journal->j_revoke_table[0])
journal->j_revoke = journal->j_revoke_table[1];
else
journal->j_revoke = journal->j_revoke_table[0];
for (i = 0; i < journal->j_revoke->hash_size; i++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
}
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*/
void journal_write_revoke_records(journal_t *journal,
transaction_t *transaction, int write_op)
{
struct journal_head *descriptor;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
struct list_head *hash_list;
int i, offset, count;
descriptor = NULL;
offset = 0;
count = 0;
/* select revoke table for committing transaction */
revoke = journal->j_revoke == journal->j_revoke_table[0] ?
journal->j_revoke_table[1] : journal->j_revoke_table[0];
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s *)
hash_list->next;
write_one_revoke_record(journal, transaction,
&descriptor, &offset,
record, write_op);
count++;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
if (descriptor)
flush_descriptor(journal, descriptor, offset, write_op);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
/*
* Write out one revoke record. We need to create a new descriptor
* block if the old one is full or if we have not already created one.
*/
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
struct journal_head **descriptorp,
int *offsetp,
struct jbd_revoke_record_s *record,
int write_op)
{
struct journal_head *descriptor;
int offset;
journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
journal_write_revoke_records in order to free all of the
revoke records: only the IO to the journal is omitted. */
if (is_journal_aborted(journal))
return;
descriptor = *descriptorp;
offset = *offsetp;
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset == journal->j_blocksize) {
flush_descriptor(journal, descriptor, offset, write_op);
descriptor = NULL;
}
}
if (!descriptor) {
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
journal_file_buffer(descriptor, transaction, BJ_LogCtl);
offset = sizeof(journal_revoke_header_t);
*descriptorp = descriptor;
}
* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
*offsetp = offset;
}
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
* be waited for during commit, so it has to go onto the appropriate
* journal buffer list.
*/
static void flush_descriptor(journal_t *journal,
struct journal_head *descriptor,
int offset, int write_op)
{
journal_revoke_header_t *header;
struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
put_bh(bh);
return;
}
header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
header->r_count = cpu_to_be32(offset);
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
write_dirty_buffer(bh, write_op);
}
#endif
/*
* Revoke support for recovery.
*
* Recovery needs to be able to:
*
* record all revoke records, including the tid of the latest instance
* of each revoke in the journal
*
* check whether a given block in a given transaction should be replayed
* (ie. has not been revoked by a revoke record in that or a subsequent
* transaction)
*
* empty the revoke table after recovery.
*/
/*
* First, setting revoke records. We create a new revoke record for
* every block ever revoked in the log as we scan it for recovery, and
* we update the existing records if we find multiple revokes for a
* single block.
*/
int journal_set_revoke(journal_t *journal,
unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (record) {
/* If we have multiple occurrences, only record the
* latest sequence number in the hashed record */
if (tid_gt(sequence, record->sequence))
record->sequence = sequence;
return 0;
}
return insert_revoke_hash(journal, blocknr, sequence);
}
/*
* Test revoke records. For a given block referenced in the log, has
* that block been revoked? A revoke record with a given transaction
* sequence number revokes all blocks in that transaction and earlier
* ones, but later transactions still need replayed.
*/
int journal_test_revoke(journal_t *journal,
unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (!record)
return 0;
if (tid_gt(sequence, record->sequence))
return 0;
return 1;
}
/*
* Finally, once recovery is over, we need to clear the revoke table so
* that it can be reused by the running filesystem.
*/
void journal_clear_revoke(journal_t *journal)
{
int i;
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
revoke = journal->j_revoke;
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s*) hash_list->next;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
}

2237
fs/jbd/transaction.c Normal file

File diff suppressed because it is too large Load diff