Fixed MTP to work with TWRP

commit f6dfaef42e
Author: awab228
Date:   2018-06-19 23:16:04 +02:00

50820 changed files with 20846062 additions and 0 deletions


drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
#
# DRBD device driver configuration
#

comment "DRBD disabled because PROC_FS or INET not selected"
	depends on PROC_FS='n' || INET='n'

config BLK_DEV_DRBD
	tristate "DRBD Distributed Replicated Block Device support"
	depends on PROC_FS && INET
	select LRU_CACHE
	select LIBCRC32C
	default n
	help
	  NOTE: In order to authenticate connections you have to select
	  CRYPTO_HMAC and a hash function as well.

	  DRBD is a shared-nothing, synchronously replicated block device. It
	  is designed to serve as a building block for high availability
	  clusters and, in this context, is a "drop-in" replacement for shared
	  storage. Simplistically, you could see it as a network RAID 1.

	  Each minor device has a role, which can be 'primary' or 'secondary'.
	  On the node with the primary device the application is supposed to
	  run and to access the device (/dev/drbdX). Every write is sent to
	  the local 'lower level block device' and, across the network, to the
	  node with the device in 'secondary' state. The secondary device
	  simply writes the data to its lower level block device.

	  DRBD can also be used in dual-Primary mode (device writable on both
	  nodes), which means it can exhibit shared-disk semantics in a
	  shared-nothing cluster. Needless to say, a cluster file system on
	  top of dual-Primary DRBD is necessary to maintain cache coherency.

	  For automatic failover you need a cluster manager (e.g. heartbeat).
	  See also: http://www.drbd.org/, http://www.linux-ha.org

	  If unsure, say N.
config DRBD_FAULT_INJECTION
	bool "DRBD fault injection"
	depends on BLK_DEV_DRBD
	help
	  Say Y here if you want to simulate IO errors, in order to test
	  DRBD's behavior.

	  The actual simulation of IO errors is done by writing 3 values to
	  /sys/module/drbd/parameters/

	  enable_faults: bitmask of...
	  1	meta data write
	  2	meta data read
	  4	resync data write
	  8	resync data read
	  16	data write
	  32	data read
	  64	read ahead
	  128	kmalloc of bitmap
	  256	allocation of peer_requests
	  512	insert data corruption on receiving side

	  fault_devs: bitmask of minor numbers
	  fault_rate: frequency in percent
	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
	  echo 16 > /sys/module/drbd/parameters/enable_faults
	  echo 1 > /sys/module/drbd/parameters/fault_devs
	  echo 5 > /sys/module/drbd/parameters/fault_rate
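
	  Fault sites combine by OR-ing the bit values, and fault_devs is
	  likewise a bitmask of minors; for example, to inject both data
	  write (16) and data read (32) errors on minors 0 and 1:
	  echo 48 > /sys/module/drbd/parameters/enable_faults
	  echo 3 > /sys/module/drbd/parameters/fault_devs
	  echo 5 > /sys/module/drbd/parameters/fault_rate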
	  If unsure, say N.


drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

(file diff suppressed because it is too large)

(file diff suppressed because it is too large)


drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,958 @@
#define pr_fmt(fmt) "drbd debugfs: " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/jiffies.h>
#include <linux/list.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_debugfs.h"
/**********************************************************************
* Whenever you change the file format, remember to bump the version. *
**********************************************************************/
static struct dentry *drbd_debugfs_root;
static struct dentry *drbd_debugfs_version;
static struct dentry *drbd_debugfs_resources;
static struct dentry *drbd_debugfs_minors;
static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
{
if (valid)
seq_printf(m, "\t%d", jiffies_to_msecs(dt));
else
seq_printf(m, "\t-");
}
static void __seq_print_rq_state_bit(struct seq_file *m,
bool is_set, char *sep, const char *set_name, const char *unset_name)
{
if (is_set && set_name) {
seq_putc(m, *sep);
seq_puts(m, set_name);
*sep = '|';
} else if (!is_set && unset_name) {
seq_putc(m, *sep);
seq_puts(m, unset_name);
*sep = '|';
}
}
static void seq_print_rq_state_bit(struct seq_file *m,
bool is_set, char *sep, const char *set_name)
{
__seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
}
/* pretty print enum drbd_req_state_bits req->rq_state */
static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
{
unsigned int s = req->rq_state;
char sep = ' ';
seq_printf(m, "\t0x%08x", s);
seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
/* RQ_WRITE ignored, already reported */
seq_puts(m, "\tlocal:");
seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
sep = ' ';
seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
if (sep == ' ')
seq_puts(m, " -");
/* for_each_connection ... */
seq_printf(m, "\tnet:");
sep = ' ';
seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
if (sep == ' ')
seq_puts(m, " -");
seq_printf(m, " :");
sep = ' ';
seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
if (sep == ' ')
seq_puts(m, " -");
seq_printf(m, "\n");
}
static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
{
/* change anything here, fixup header below! */
unsigned int s = req->rq_state;
#define RQ_HDR_1 "epoch\tsector\tsize\trw"
seq_printf(m, "0x%x\t%llu\t%u\t%s",
req->epoch,
(unsigned long long)req->i.sector, req->i.size >> 9,
(s & RQ_WRITE) ? "W" : "R");
#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
#define RQ_HDR_3 "\tsent\tacked\tdone"
seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
#define RQ_HDR_4 "\tstate\n"
seq_print_request_state(m, req);
}
#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
{
seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
seq_print_one_request(m, req, now);
}
static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
{
struct drbd_device *device;
unsigned int i;
seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
rcu_read_lock();
idr_for_each_entry(&resource->devices, device, i) {
struct drbd_md_io tmp;
/* In theory this is racy,
* in the sense that there could have been a
* drbd_md_put_buffer(); drbd_md_get_buffer();
* between accessing these members here. */
tmp = device->md_io;
if (atomic_read(&tmp.in_use)) {
seq_printf(m, "%u\t%u\t%d\t",
device->minor, device->vnr,
jiffies_to_msecs(now - tmp.start_jif));
if (time_before(tmp.submit_jif, tmp.start_jif))
seq_puts(m, "-\t");
else
seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
seq_printf(m, "%s\n", tmp.current_use);
}
}
rcu_read_unlock();
}
static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
{
struct drbd_device *device;
unsigned int i;
seq_puts(m, "minor\tvnr\tage\t#waiting\n");
rcu_read_lock();
idr_for_each_entry(&resource->devices, device, i) {
unsigned long jif;
struct drbd_request *req;
int n = atomic_read(&device->ap_actlog_cnt);
if (n) {
spin_lock_irq(&device->resource->req_lock);
req = list_first_entry_or_null(&device->pending_master_completion[1],
struct drbd_request, req_pending_master_completion);
/* if the oldest request does not wait for the activity log
* it is not interesting for us here */
if (req && !(req->rq_state & RQ_IN_ACT_LOG))
jif = req->start_jif;
else
req = NULL;
spin_unlock_irq(&device->resource->req_lock);
}
if (n) {
seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
if (req)
seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
else
seq_puts(m, "-\t");
seq_printf(m, "%u\n", n);
}
}
rcu_read_unlock();
}
static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
{
struct drbd_bm_aio_ctx *ctx;
unsigned long start_jif;
unsigned int in_flight;
unsigned int flags;
spin_lock_irq(&device->resource->req_lock);
ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
if (ctx && ctx->done)
ctx = NULL;
if (ctx) {
start_jif = ctx->start_jif;
in_flight = atomic_read(&ctx->in_flight);
flags = ctx->flags;
}
spin_unlock_irq(&device->resource->req_lock);
if (ctx) {
seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
device->minor, device->vnr,
(flags & BM_AIO_READ) ? 'R' : 'W',
jiffies_to_msecs(now - start_jif),
in_flight);
}
}
static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
{
struct drbd_device *device;
unsigned int i;
seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
rcu_read_lock();
idr_for_each_entry(&resource->devices, device, i) {
seq_print_device_bitmap_io(m, device, now);
}
rcu_read_unlock();
}
/* pretty print enum peer_req->flags */
static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
{
unsigned long f = peer_req->flags;
char sep = ' ';
__seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
__seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
if (f & EE_IS_TRIM) {
seq_putc(m, sep);
sep = '|';
if (f & EE_IS_TRIM_USE_ZEROOUT)
seq_puts(m, "zero-out");
else
seq_puts(m, "trim");
}
seq_putc(m, '\n');
}
static void seq_print_peer_request(struct seq_file *m,
struct drbd_device *device, struct list_head *lh,
unsigned long now)
{
bool reported_preparing = false;
struct drbd_peer_request *peer_req;
list_for_each_entry(peer_req, lh, w.list) {
if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
continue;
if (device)
seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
seq_printf(m, "%llu\t%u\t%c\t%u\t",
(unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
(peer_req->flags & EE_WRITE) ? 'W' : 'R',
jiffies_to_msecs(now - peer_req->submit_jif));
seq_print_peer_request_flags(m, peer_req);
if (peer_req->flags & EE_SUBMITTED)
break;
else
reported_preparing = true;
}
}
static void seq_print_device_peer_requests(struct seq_file *m,
struct drbd_device *device, unsigned long now)
{
seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
spin_lock_irq(&device->resource->req_lock);
seq_print_peer_request(m, device, &device->active_ee, now);
seq_print_peer_request(m, device, &device->read_ee, now);
seq_print_peer_request(m, device, &device->sync_ee, now);
spin_unlock_irq(&device->resource->req_lock);
if (test_bit(FLUSH_PENDING, &device->flags)) {
seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
device->minor, device->vnr,
jiffies_to_msecs(now - device->flush_jif));
}
}
static void seq_print_resource_pending_peer_requests(struct seq_file *m,
struct drbd_resource *resource, unsigned long now)
{
struct drbd_device *device;
unsigned int i;
rcu_read_lock();
idr_for_each_entry(&resource->devices, device, i) {
seq_print_device_peer_requests(m, device, now);
}
rcu_read_unlock();
}
static void seq_print_resource_transfer_log_summary(struct seq_file *m,
struct drbd_resource *resource,
struct drbd_connection *connection,
unsigned long now)
{
struct drbd_request *req;
unsigned int count = 0;
unsigned int show_state = 0;
seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
spin_lock_irq(&resource->req_lock);
list_for_each_entry(req, &connection->transfer_log, tl_requests) {
unsigned int tmp = 0;
unsigned int s;
++count;
/* don't disable irq "forever" */
if (!(count & 0x1ff)) {
struct drbd_request *req_next;
kref_get(&req->kref);
spin_unlock_irq(&resource->req_lock);
cond_resched();
spin_lock_irq(&resource->req_lock);
req_next = list_next_entry(req, tl_requests);
if (kref_put(&req->kref, drbd_req_destroy))
req = req_next;
if (&req->tl_requests == &connection->transfer_log)
break;
}
s = req->rq_state;
/* This is meant to summarize timing issues, to be able to tell
* local disk problems from network problems.
* Skip requests, if we have shown an even older request with
* similar aspects already. */
if (req->master_bio == NULL)
tmp |= 1;
if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
tmp |= 2;
if (s & RQ_NET_MASK) {
if (!(s & RQ_NET_SENT))
tmp |= 4;
if (s & RQ_NET_PENDING)
tmp |= 8;
if (!(s & RQ_NET_DONE))
tmp |= 16;
}
if ((tmp & show_state) == tmp)
continue;
show_state |= tmp;
seq_printf(m, "%u\t", count);
seq_print_minor_vnr_req(m, req, now);
if (show_state == 0x1f)
break;
}
spin_unlock_irq(&resource->req_lock);
}
/* TODO: transfer_log and friends should be moved to resource */
static int in_flight_summary_show(struct seq_file *m, void *pos)
{
struct drbd_resource *resource = m->private;
struct drbd_connection *connection;
unsigned long jif = jiffies;
connection = first_connection(resource);
/* This does not happen, actually.
* But be robust and prepare for future code changes. */
if (!connection || !kref_get_unless_zero(&connection->kref))
return -ESTALE;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
seq_puts(m, "oldest bitmap IO\n");
seq_print_resource_pending_bitmap_io(m, resource, jif);
seq_putc(m, '\n');
seq_puts(m, "meta data IO\n");
seq_print_resource_pending_meta_io(m, resource, jif);
seq_putc(m, '\n');
seq_puts(m, "socket buffer stats\n");
/* for each connection ... once we have more than one */
rcu_read_lock();
if (connection->data.socket) {
/* open coded SIOCINQ, the "relevant" part */
struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
int answ = tp->rcv_nxt - tp->copied_seq;
seq_printf(m, "unread receive buffer: %u Byte\n", answ);
/* open coded SIOCOUTQ, the "relevant" part */
answ = tp->write_seq - tp->snd_una;
seq_printf(m, "unacked send buffer: %u Byte\n", answ);
}
rcu_read_unlock();
seq_putc(m, '\n');
seq_puts(m, "oldest peer requests\n");
seq_print_resource_pending_peer_requests(m, resource, jif);
seq_putc(m, '\n');
seq_puts(m, "application requests waiting for activity log\n");
seq_print_waiting_for_AL(m, resource, jif);
seq_putc(m, '\n');
seq_puts(m, "oldest application requests\n");
seq_print_resource_transfer_log_summary(m, resource, connection, jif);
seq_putc(m, '\n');
jif = jiffies - jif;
if (jif)
seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
kref_put(&connection->kref, drbd_destroy_connection);
return 0;
}
/* simple_positive(file->f_dentry) respectively debugfs_positive(),
 * but neither is "reachable" from here.
 * So we have our own inline version of it below. :-( */
static inline int debugfs_positive(struct dentry *dentry)
{
return dentry->d_inode && !d_unhashed(dentry);
}
/* make sure at *open* time that the respective object won't go away. */
static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
void *data, struct kref *kref,
void (*release)(struct kref *))
{
struct dentry *parent;
int ret = -ESTALE;
/* Are we still linked,
* or has debugfs_remove() already been called? */
parent = file->f_dentry->d_parent;
/* not sure if this can happen: */
if (!parent || !parent->d_inode)
goto out;
/* serialize with d_delete() */
mutex_lock(&parent->d_inode->i_mutex);
/* Make sure the object is still alive */
if (debugfs_positive(file->f_dentry)
&& kref_get_unless_zero(kref))
ret = 0;
mutex_unlock(&parent->d_inode->i_mutex);
if (!ret) {
ret = single_open(file, show, data);
if (ret)
kref_put(kref, release);
}
out:
return ret;
}
static int in_flight_summary_open(struct inode *inode, struct file *file)
{
struct drbd_resource *resource = inode->i_private;
return drbd_single_open(file, in_flight_summary_show, resource,
&resource->kref, drbd_destroy_resource);
}
static int in_flight_summary_release(struct inode *inode, struct file *file)
{
struct drbd_resource *resource = inode->i_private;
kref_put(&resource->kref, drbd_destroy_resource);
return single_release(inode, file);
}
static const struct file_operations in_flight_summary_fops = {
.owner = THIS_MODULE,
.open = in_flight_summary_open,
.read = seq_read,
.llseek = seq_lseek,
.release = in_flight_summary_release,
};
void drbd_debugfs_resource_add(struct drbd_resource *resource)
{
struct dentry *dentry;
if (!drbd_debugfs_resources)
return;
dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
if (IS_ERR_OR_NULL(dentry))
goto fail;
resource->debugfs_res = dentry;
dentry = debugfs_create_dir("volumes", resource->debugfs_res);
if (IS_ERR_OR_NULL(dentry))
goto fail;
resource->debugfs_res_volumes = dentry;
dentry = debugfs_create_dir("connections", resource->debugfs_res);
if (IS_ERR_OR_NULL(dentry))
goto fail;
resource->debugfs_res_connections = dentry;
dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
resource->debugfs_res, resource,
&in_flight_summary_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
resource->debugfs_res_in_flight_summary = dentry;
return;
fail:
drbd_debugfs_resource_cleanup(resource);
drbd_err(resource, "failed to create debugfs dentry\n");
}
static void drbd_debugfs_remove(struct dentry **dp)
{
debugfs_remove(*dp);
*dp = NULL;
}
void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
{
/* it is ok to call debugfs_remove(NULL) */
drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
drbd_debugfs_remove(&resource->debugfs_res_connections);
drbd_debugfs_remove(&resource->debugfs_res_volumes);
drbd_debugfs_remove(&resource->debugfs_res);
}
static void seq_print_one_timing_detail(struct seq_file *m,
const struct drbd_thread_timing_details *tdp,
unsigned long now)
{
struct drbd_thread_timing_details td;
/* No locking...
* use temporary assignment to get at consistent data. */
do {
td = *tdp;
} while (td.cb_nr != tdp->cb_nr);
if (!td.cb_addr)
return;
seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
td.cb_nr,
jiffies_to_msecs(now - td.start_jif),
td.caller_fn, td.line,
td.cb_addr);
}
static void seq_print_timing_details(struct seq_file *m,
const char *title,
unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
{
unsigned int start_idx;
unsigned int i;
seq_printf(m, "%s\n", title);
/* If not much is going on, this will result in natural ordering.
* If it is very busy, we will possibly skip events, or even see wrap
* arounds, which could only be avoided with locking.
*/
start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
seq_print_one_timing_detail(m, tdp+i, now);
for (i = 0; i < start_idx; i++)
seq_print_one_timing_detail(m, tdp+i, now);
}
static int callback_history_show(struct seq_file *m, void *ignored)
{
struct drbd_connection *connection = m->private;
unsigned long jif = jiffies;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
seq_puts(m, "n\tage\tcallsite\tfn\n");
seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
return 0;
}
static int callback_history_open(struct inode *inode, struct file *file)
{
struct drbd_connection *connection = inode->i_private;
return drbd_single_open(file, callback_history_show, connection,
&connection->kref, drbd_destroy_connection);
}
static int callback_history_release(struct inode *inode, struct file *file)
{
struct drbd_connection *connection = inode->i_private;
kref_put(&connection->kref, drbd_destroy_connection);
return single_release(inode, file);
}
static const struct file_operations connection_callback_history_fops = {
.owner = THIS_MODULE,
.open = callback_history_open,
.read = seq_read,
.llseek = seq_lseek,
.release = callback_history_release,
};
static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
{
struct drbd_connection *connection = m->private;
unsigned long now = jiffies;
struct drbd_request *r1, *r2;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
spin_lock_irq(&connection->resource->req_lock);
r1 = connection->req_next;
if (r1)
seq_print_minor_vnr_req(m, r1, now);
r2 = connection->req_ack_pending;
if (r2 && r2 != r1) {
r1 = r2;
seq_print_minor_vnr_req(m, r1, now);
}
r2 = connection->req_not_net_done;
if (r2 && r2 != r1)
seq_print_minor_vnr_req(m, r2, now);
spin_unlock_irq(&connection->resource->req_lock);
return 0;
}
static int connection_oldest_requests_open(struct inode *inode, struct file *file)
{
struct drbd_connection *connection = inode->i_private;
return drbd_single_open(file, connection_oldest_requests_show, connection,
&connection->kref, drbd_destroy_connection);
}
static int connection_oldest_requests_release(struct inode *inode, struct file *file)
{
struct drbd_connection *connection = inode->i_private;
kref_put(&connection->kref, drbd_destroy_connection);
return single_release(inode, file);
}
static const struct file_operations connection_oldest_requests_fops = {
.owner = THIS_MODULE,
.open = connection_oldest_requests_open,
.read = seq_read,
.llseek = seq_lseek,
.release = connection_oldest_requests_release,
};
void drbd_debugfs_connection_add(struct drbd_connection *connection)
{
struct dentry *conns_dir = connection->resource->debugfs_res_connections;
struct dentry *dentry;
if (!conns_dir)
return;
/* Once we enable multiple peers,
 * these connections will have descriptive names.
 * For now, it is just the one connection to the (only) "peer". */
dentry = debugfs_create_dir("peer", conns_dir);
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn = dentry;
dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
connection->debugfs_conn, connection,
&connection_callback_history_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn_callback_history = dentry;
dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
connection->debugfs_conn, connection,
&connection_oldest_requests_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn_oldest_requests = dentry;
return;
fail:
drbd_debugfs_connection_cleanup(connection);
drbd_err(connection, "failed to create debugfs dentry\n");
}
void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
{
drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
drbd_debugfs_remove(&connection->debugfs_conn);
}
static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
{
struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
seq_printf(m, "%5d %s %s %s", bme->rs_left,
test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
);
}
static int device_resync_extents_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
if (get_ldev_if_state(device, D_FAILED)) {
lc_seq_printf_stats(m, device->resync);
lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
put_ldev(device);
}
return 0;
}
static int device_act_log_extents_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
if (get_ldev_if_state(device, D_FAILED)) {
lc_seq_printf_stats(m, device->act_log);
lc_seq_dump_details(m, device->act_log, "", NULL);
put_ldev(device);
}
return 0;
}
static int device_oldest_requests_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
struct drbd_resource *resource = device->resource;
unsigned long now = jiffies;
struct drbd_request *r1, *r2;
int i;
/* BUMP me if you change the file format/content/presentation */
seq_printf(m, "v: %u\n\n", 0);
seq_puts(m, RQ_HDR);
spin_lock_irq(&resource->req_lock);
/* WRITE, then READ */
for (i = 1; i >= 0; --i) {
r1 = list_first_entry_or_null(&device->pending_master_completion[i],
struct drbd_request, req_pending_master_completion);
r2 = list_first_entry_or_null(&device->pending_completion[i],
struct drbd_request, req_pending_local);
if (r1)
seq_print_one_request(m, r1, now);
if (r2 && r2 != r1)
seq_print_one_request(m, r2, now);
}
spin_unlock_irq(&resource->req_lock);
return 0;
}
static int device_data_gen_id_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
struct drbd_md *md;
enum drbd_uuid_index idx;
if (!get_ldev_if_state(device, D_FAILED))
return -ENODEV;
md = &device->ldev->md;
spin_lock_irq(&md->uuid_lock);
for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
seq_printf(m, "0x%016llX\n", md->uuid[idx]);
}
spin_unlock_irq(&md->uuid_lock);
put_ldev(device);
return 0;
}
#define drbd_debugfs_device_attr(name) \
static int device_ ## name ## _open(struct inode *inode, struct file *file) \
{ \
struct drbd_device *device = inode->i_private; \
return drbd_single_open(file, device_ ## name ## _show, device, \
&device->kref, drbd_destroy_device); \
} \
static int device_ ## name ## _release(struct inode *inode, struct file *file) \
{ \
struct drbd_device *device = inode->i_private; \
kref_put(&device->kref, drbd_destroy_device); \
return single_release(inode, file); \
} \
static const struct file_operations device_ ## name ## _fops = { \
.owner = THIS_MODULE, \
.open = device_ ## name ## _open, \
.read = seq_read, \
.llseek = seq_lseek, \
.release = device_ ## name ## _release, \
};
drbd_debugfs_device_attr(oldest_requests)
drbd_debugfs_device_attr(act_log_extents)
drbd_debugfs_device_attr(resync_extents)
drbd_debugfs_device_attr(data_gen_id)
void drbd_debugfs_device_add(struct drbd_device *device)
{
struct dentry *vols_dir = device->resource->debugfs_res_volumes;
char minor_buf[8]; /* MINORMASK, MINORBITS == 20 */
char vnr_buf[8]; /* the volume number vnr is only 16 bit anyway */
char *slink_name = NULL;
struct dentry *dentry;
if (!vols_dir || !drbd_debugfs_minors)
return;
snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
dentry = debugfs_create_dir(vnr_buf, vols_dir);
if (IS_ERR_OR_NULL(dentry))
goto fail;
device->debugfs_vol = dentry;
snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
device->resource->name, device->vnr);
if (!slink_name)
goto fail;
dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
kfree(slink_name);
slink_name = NULL;
if (IS_ERR_OR_NULL(dentry))
goto fail;
device->debugfs_minor = dentry;
#define DCF(name) do { \
dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
device->debugfs_vol, device, \
&device_ ## name ## _fops); \
if (IS_ERR_OR_NULL(dentry)) \
goto fail; \
device->debugfs_vol_ ## name = dentry; \
} while (0)
DCF(oldest_requests);
DCF(act_log_extents);
DCF(resync_extents);
DCF(data_gen_id);
#undef DCF
return;
fail:
drbd_debugfs_device_cleanup(device);
drbd_err(device, "failed to create debugfs entries\n");
}
void drbd_debugfs_device_cleanup(struct drbd_device *device)
{
drbd_debugfs_remove(&device->debugfs_minor);
drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
drbd_debugfs_remove(&device->debugfs_vol);
}
void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
{
struct dentry *conn_dir = peer_device->connection->debugfs_conn;
struct dentry *dentry;
char vnr_buf[8];
if (!conn_dir)
return;
snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
dentry = debugfs_create_dir(vnr_buf, conn_dir);
if (IS_ERR_OR_NULL(dentry))
goto fail;
peer_device->debugfs_peer_dev = dentry;
return;
fail:
drbd_debugfs_peer_device_cleanup(peer_device);
drbd_err(peer_device, "failed to create debugfs entries\n");
}
void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
{
drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
}
static int drbd_version_show(struct seq_file *m, void *ignored)
{
seq_printf(m, "# %s\n", drbd_buildtag());
seq_printf(m, "VERSION=%s\n", REL_VERSION);
seq_printf(m, "API_VERSION=%u\n", API_VERSION);
seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
return 0;
}
static int drbd_version_open(struct inode *inode, struct file *file)
{
return single_open(file, drbd_version_show, NULL);
}
static struct file_operations drbd_version_fops = {
.owner = THIS_MODULE,
.open = drbd_version_open,
.llseek = seq_lseek,
.read = seq_read,
.release = single_release,
};
/* not __exit, may be indirectly called
* from the module-load-failure path as well. */
void drbd_debugfs_cleanup(void)
{
drbd_debugfs_remove(&drbd_debugfs_resources);
drbd_debugfs_remove(&drbd_debugfs_minors);
drbd_debugfs_remove(&drbd_debugfs_version);
drbd_debugfs_remove(&drbd_debugfs_root);
}
int __init drbd_debugfs_init(void)
{
struct dentry *dentry;
dentry = debugfs_create_dir("drbd", NULL);
if (IS_ERR_OR_NULL(dentry))
goto fail;
drbd_debugfs_root = dentry;
dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
drbd_debugfs_version = dentry;
dentry = debugfs_create_dir("resources", drbd_debugfs_root);
if (IS_ERR_OR_NULL(dentry))
goto fail;
drbd_debugfs_resources = dentry;
dentry = debugfs_create_dir("minors", drbd_debugfs_root);
if (IS_ERR_OR_NULL(dentry))
goto fail;
drbd_debugfs_minors = dentry;
return 0;
fail:
drbd_debugfs_cleanup();
if (dentry)
return PTR_ERR(dentry);
else
return -EINVAL;
}


drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include "drbd_int.h"
#ifdef CONFIG_DEBUG_FS
int __init drbd_debugfs_init(void);
void drbd_debugfs_cleanup(void);
void drbd_debugfs_resource_add(struct drbd_resource *resource);
void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
void drbd_debugfs_connection_add(struct drbd_connection *connection);
void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
void drbd_debugfs_device_add(struct drbd_device *device);
void drbd_debugfs_device_cleanup(struct drbd_device *device);
void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
#else
static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
static inline void drbd_debugfs_cleanup(void) { }
static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
#endif
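
/* A minimal caller-side sketch (not part of this commit; example_init()
 * is hypothetical): with CONFIG_DEBUG_FS=n, drbd_debugfs_init() above
 * becomes the static inline stub returning -ENODEV, so a caller compiles
 * unchanged and can simply treat debugfs as optional. */
static int __init example_init(void)
{
	if (drbd_debugfs_init())
		pr_notice("debugfs not available\n");
	return 0;
}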

(file diff suppressed because it is too large)


drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,179 @@
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"
/**
* interval_end - return end of @node
*/
static inline
sector_t interval_end(struct rb_node *node)
{
struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
return this->end;
}
/**
* compute_subtree_last - compute end of @node
*
* The end of an interval is the highest (start + (size >> 9)) value of this
* node and of its children. Called for @node and its parents whenever the end
* may have changed.
*/
static inline sector_t
compute_subtree_last(struct drbd_interval *node)
{
sector_t max = node->sector + (node->size >> 9);
if (node->rb.rb_left) {
sector_t left = interval_end(node->rb.rb_left);
if (left > max)
max = left;
}
if (node->rb.rb_right) {
sector_t right = interval_end(node->rb.rb_right);
if (right > max)
max = right;
}
return max;
}
RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
sector_t, end, compute_subtree_last);
/**
* drbd_insert_interval - insert a new interval into a tree
*/
bool
drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
{
struct rb_node **new = &root->rb_node, *parent = NULL;
sector_t this_end = this->sector + (this->size >> 9);
BUG_ON(!IS_ALIGNED(this->size, 512));
while (*new) {
struct drbd_interval *here =
rb_entry(*new, struct drbd_interval, rb);
parent = *new;
if (here->end < this_end)
here->end = this_end;
if (this->sector < here->sector)
new = &(*new)->rb_left;
else if (this->sector > here->sector)
new = &(*new)->rb_right;
else if (this < here)
new = &(*new)->rb_left;
else if (this > here)
new = &(*new)->rb_right;
else
return false;
}
this->end = this_end;
rb_link_node(&this->rb, parent, new);
rb_insert_augmented(&this->rb, root, &augment_callbacks);
return true;
}
/**
* drbd_contains_interval - check if a tree contains a given interval
* @sector: start sector of @interval
* @interval: may not be a valid pointer
*
 * Returns whether the tree contains the node @interval with start sector @sector.
* Does not dereference @interval until @interval is known to be a valid object
* in @tree. Returns %false if @interval is in the tree but with a different
* sector number.
*/
bool
drbd_contains_interval(struct rb_root *root, sector_t sector,
struct drbd_interval *interval)
{
struct rb_node *node = root->rb_node;
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (sector < here->sector)
node = node->rb_left;
else if (sector > here->sector)
node = node->rb_right;
else if (interval < here)
node = node->rb_left;
else if (interval > here)
node = node->rb_right;
else
return true;
}
return false;
}
/**
* drbd_remove_interval - remove an interval from a tree
*/
void
drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
{
rb_erase_augmented(&this->rb, root, &augment_callbacks);
}
/**
* drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
* @sector: start sector
* @size: size, aligned to 512 bytes
*
* Returns an interval overlapping with [sector, sector + size), or NULL if
* there is none. When there is more than one overlapping interval in the
* tree, the interval with the lowest start sector is returned, and all other
* overlapping intervals will be on the right side of the tree, reachable with
* rb_next().
*/
struct drbd_interval *
drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
{
struct rb_node *node = root->rb_node;
struct drbd_interval *overlap = NULL;
sector_t end = sector + (size >> 9);
BUG_ON(!IS_ALIGNED(size, 512));
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (node->rb_left &&
sector < interval_end(node->rb_left)) {
/* Overlap if any must be on left side */
node = node->rb_left;
} else if (here->sector < end &&
sector < here->sector + (here->size >> 9)) {
overlap = here;
break;
} else if (sector >= here->sector) {
/* Overlap if any must be on right side */
node = node->rb_right;
} else
break;
}
return overlap;
}
struct drbd_interval *
drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
{
sector_t end = sector + (size >> 9);
struct rb_node *node;
for (;;) {
node = rb_next(&i->rb);
if (!node)
return NULL;
i = rb_entry(node, struct drbd_interval, rb);
if (i->sector >= end)
return NULL;
if (sector < i->sector + (i->size >> 9))
return i;
}
}


drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,42 @@
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H
#include <linux/types.h>
#include <linux/rbtree.h>
struct drbd_interval {
struct rb_node rb;
sector_t sector; /* start sector of the interval */
unsigned int size; /* size in bytes */
sector_t end; /* highest interval end in subtree */
int local:1; /* local or remote request? */
int waiting:1; /* someone is waiting for this to complete */
int completed:1; /* this has been completed already;
* ignore for conflict detection */
};
static inline void drbd_clear_interval(struct drbd_interval *i)
{
RB_CLEAR_NODE(&i->rb);
}
static inline bool drbd_interval_empty(struct drbd_interval *i)
{
return RB_EMPTY_NODE(&i->rb);
}
extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
extern bool drbd_contains_interval(struct rb_root *, sector_t,
struct drbd_interval *);
extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
unsigned int);
extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
unsigned int);
#define drbd_for_each_overlap(i, root, sector, size) \
for (i = drbd_find_overlap(root, sector, size); \
i; \
i = drbd_next_overlap(i, sector, size))
#endif /* __DRBD_INTERVAL_H */
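
/* A short usage sketch (hypothetical; example_count_overlaps() is not
 * part of this commit): walk all intervals overlapping a request with
 * drbd_for_each_overlap(). The caller must hold whatever lock protects
 * the tree, and @size must be a multiple of 512, or drbd_find_overlap()
 * will BUG_ON(). */
static inline unsigned int
example_count_overlaps(struct rb_root *root, sector_t sector, unsigned int size)
{
	struct drbd_interval *i;
	unsigned int n = 0;

	drbd_for_each_overlap(i, root, sector, size)
		n++;
	return n;
}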

(file diff suppressed because it is too large)

drivers/block/drbd/drbd_nl.c: new file, 3682 lines
(file diff suppressed because it is too large)


drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,54 @@
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
struct nlattr *head = nla_data(nla);
int len = nla_len(nla);
int rem;
/*
* validate_nla (called from nla_parse_nested) ignores attributes
* beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
* In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
* flag set also, check and remove that flag before calling
* nla_parse_nested.
*/
nla_for_each_attr(nla, head, len, rem) {
if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
if (nla_type(nla) > maxtype)
return -EOPNOTSUPP;
}
}
return 0;
}
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy)
{
int err;
err = drbd_nla_check_mandatory(maxtype, nla);
if (!err)
err = nla_parse_nested(tb, maxtype, nla, policy);
return err;
}
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
int err;
/*
* If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
* we don't know about that attribute, reject all the nested
* attributes.
*/
err = drbd_nla_check_mandatory(maxtype, nla);
if (err)
return ERR_PTR(err);
return nla_find_nested(nla, attrtype);
}
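
/* A hedged usage sketch: the EXAMPLE_* attribute set and policy below are
 * made up for illustration. They show how a caller would route a nested
 * attribute through drbd_nla_parse_nested() so that unknown attributes
 * flagged DRBD_GENLA_F_MANDATORY are rejected before the regular
 * nla_parse_nested() validation runs. */
enum { EXAMPLE_A_UNSPEC, EXAMPLE_A_NAME, __EXAMPLE_A_MAX };
#define EXAMPLE_A_MAX (__EXAMPLE_A_MAX - 1)

static const struct nla_policy example_policy[EXAMPLE_A_MAX + 1] = {
	[EXAMPLE_A_NAME] = { .type = NLA_NUL_STRING, .len = 32 },
};

static int example_parse(struct nlattr *nested)
{
	struct nlattr *tb[EXAMPLE_A_MAX + 1];

	return drbd_nla_parse_nested(tb, EXAMPLE_A_MAX, nested, example_policy);
}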


drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */


drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,368 @@
/*
drbd_proc.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/drbd.h>
#include "drbd_int.h"
static int drbd_proc_open(struct inode *inode, struct file *file);
static int drbd_proc_release(struct inode *inode, struct file *file);
struct proc_dir_entry *drbd_proc;
const struct file_operations drbd_proc_fops = {
.owner = THIS_MODULE,
.open = drbd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = drbd_proc_release,
};
static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
{
/* v is in kB/sec. We don't expect TiByte/sec yet. */
if (unlikely(v >= 1000000)) {
/* cool: > GiByte/s */
seq_printf(seq, "%ld,", v / 1000000);
v %= 1000000;
seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
} else if (likely(v >= 1000))
seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
else
seq_printf(seq, "%ld", v);
}
static void drbd_get_syncer_progress(struct drbd_device *device,
union drbd_dev_state state, unsigned long *rs_total,
unsigned long *bits_left, unsigned int *per_mil_done)
{
/* this is to break it at compile time when we change that, in case we
* want to support more than (1<<32) bits on a 32bit arch. */
typecheck(unsigned long, device->rs_total);
*rs_total = device->rs_total;
/* note: both rs_total and rs_left are in bits, i.e. in
* units of BM_BLOCK_SIZE.
* for the percentage, we don't care. */
if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
*bits_left = device->ov_left;
else
*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
/* >> 10 to prevent overflow,
* +1 to prevent division by zero */
if (*bits_left > *rs_total) {
/* D'oh. Maybe a logic bug somewhere. More likely just a race
* between state change and reset of rs_total.
*/
*bits_left = *rs_total;
*per_mil_done = *rs_total ? 0 : 1000;
} else {
/* Make sure the division happens in long context.
* We allow up to one petabyte storage right now,
* at a granularity of 4k per bit that is 2**38 bits.
* After shift right and multiplication by 1000,
* this should still fit easily into a 32bit long,
* so we don't need a 64bit division on 32bit arch.
* Note: currently we don't support such large bitmaps on 32bit
* arch anyways, but no harm done to be prepared for it here.
*/
unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
unsigned long left = *bits_left >> shift;
unsigned long total = 1UL + (*rs_total >> shift);
unsigned long tmp = 1000UL - left * 1000UL/total;
*per_mil_done = tmp;
}
}
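/* Worked example for the scaling above (illustrative numbers): at the
 * one-petabyte limit, *rs_total = 2^38 bits (4 KiB per bit), so shift
 * is 16 and total is about 2^22; even left * 1000 then stays below
 * 2^32, so the division fits an unsigned long on 32-bit as claimed. */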
/*
 * progress bars shamelessly adapted from drivers/md/md.c
* output looks like
* [=====>..............] 33.5% (23456/123456)
* finish: 2:20:20 speed: 6,345 (6,456) K/sec
*/
static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
union drbd_dev_state state)
{
unsigned long db, dt, dbdt, rt, rs_total, rs_left;
unsigned int res;
int i, x, y;
int stalled = 0;
drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
x = res/50;
y = 20-x;
seq_printf(seq, "\t[");
for (i = 1; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">");
for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
seq_printf(seq, "verified:");
else
seq_printf(seq, "sync'ed:");
seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
/* if more than a few GB, display in MB */
if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
seq_printf(seq, "(%lu/%lu)M",
(unsigned long) Bit2KB(rs_left >> 10),
(unsigned long) Bit2KB(rs_total >> 10));
else
seq_printf(seq, "(%lu/%lu)K",
(unsigned long) Bit2KB(rs_left),
(unsigned long) Bit2KB(rs_total));
seq_printf(seq, "\n\t");
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
* the * 100 / 100 trick are important. We do a +1 to be
* safe against division by zero. We only estimate anyway.
*
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
*/
/* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
* at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
* least DRBD_SYNC_MARK_STEP time before it will be modified. */
/* ------------------------ ~18s average ------------------------ */
i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
dt = (jiffies - device->rs_mark_time[i]) / HZ;
if (dt > 180)
stalled = 1;
if (!dt)
dt++;
db = device->rs_mark_left[i] - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
rt / 3600, (rt % 3600) / 60, rt % 60);
dbdt = Bit2KB(db/dt);
seq_printf(seq, " speed: ");
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, " (");
/* ------------------------- ~3s average ------------------------ */
if (proc_details >= 1) {
/* this is what drbd_rs_should_slow_down() uses */
i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
dt = (jiffies - device->rs_mark_time[i]) / HZ;
if (!dt)
dt++;
db = device->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, " -- ");
}
/* --------------------- long term average ---------------------- */
/* mean speed since syncer started
* we do account for PausedSync periods */
dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
if (dt == 0)
dt = 1;
db = rs_total - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, ")");
if (state.conn == C_SYNC_TARGET ||
state.conn == C_VERIFY_S) {
seq_printf(seq, " want: ");
seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
}
seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
if (proc_details >= 1) {
/* 64 bit:
* we convert to sectors in the display below. */
unsigned long bm_bits = drbd_bm_bits(device);
unsigned long bit_pos;
unsigned long long stop_sector = 0;
if (state.conn == C_VERIFY_S ||
state.conn == C_VERIFY_T) {
bit_pos = bm_bits - device->ov_left;
if (verify_can_do_stop_sector(device))
stop_sector = device->ov_stop_sector;
} else
bit_pos = device->bm_resync_fo;
/* Total sectors may be slightly off for oddly
* sized devices. So what. */
seq_printf(seq,
"\t%3d%% sector pos: %llu/%llu",
(int)(bit_pos / (bm_bits/100+1)),
(unsigned long long)bit_pos * BM_SECT_PER_BIT,
(unsigned long long)bm_bits * BM_SECT_PER_BIT);
if (stop_sector != 0 && stop_sector != ULLONG_MAX)
seq_printf(seq, " stop sector: %llu", stop_sector);
seq_printf(seq, "\n");
}
}
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, prev_i = -1;
const char *sn;
struct drbd_device *device;
struct net_conf *nc;
union drbd_dev_state state;
char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
[WO_drain_io] = 'd',
[WO_bdev_flush] = 'f',
};
seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
/*
cs .. connection state
ro .. node role (local/remote)
ds .. disk state (local/remote)
protocol
various flags
ns .. network send
nr .. network receive
dw .. disk write
dr .. disk read
al .. activity log write count
bm .. bitmap update write count
pe .. pending (waiting for ack or data reply)
ua .. unack'd (still need to send ack or data reply)
ap .. application requests accepted, but not yet completed
ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
wo .. write ordering mode currently in use
oos .. known out-of-sync kB
*/
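/* A representative status line, with made-up numbers, looks like:
 *  0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
 *     ns:12345 nr:0 dw:12345 dr:6789 al:9 bm:1 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0
 */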
rcu_read_lock();
idr_for_each_entry(&drbd_devices, device, i) {
if (prev_i != i - 1)
seq_printf(seq, "\n");
prev_i = i;
state = device->state;
sn = drbd_conn_str(state.conn);
if (state.conn == C_STANDALONE &&
state.disk == D_DISKLESS &&
state.role == R_SECONDARY) {
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
/* reset device->congestion_reason */
bdi_rw_congested(&device->rq_queue->backing_dev_info);
nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
drbd_role_str(state.role),
drbd_role_str(state.peer),
drbd_disk_str(state.disk),
drbd_disk_str(state.pdsk),
wp,
drbd_suspended(device) ? 's' : 'r',
state.aftr_isp ? 'a' : '-',
state.peer_isp ? 'p' : '-',
state.user_isp ? 'u' : '-',
device->congestion_reason ?: '-',
test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
device->send_cnt/2,
device->recv_cnt/2,
device->writ_cnt/2,
device->read_cnt/2,
device->al_writ_cnt,
device->bm_writ_cnt,
atomic_read(&device->local_cnt),
atomic_read(&device->ap_pending_cnt) +
atomic_read(&device->rs_pending_cnt),
atomic_read(&device->unacked_cnt),
atomic_read(&device->ap_bio_cnt),
first_peer_device(device)->connection->epochs,
write_ordering_chars[device->resource->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
drbd_bm_total_weight(device)));
}
if (state.conn == C_SYNC_SOURCE ||
state.conn == C_SYNC_TARGET ||
state.conn == C_VERIFY_S ||
state.conn == C_VERIFY_T)
drbd_syncer_progress(device, seq, state);
if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
lc_seq_printf_stats(seq, device->resync);
lc_seq_printf_stats(seq, device->act_log);
put_ldev(device);
}
if (proc_details >= 2)
seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
}
rcu_read_unlock();
return 0;
}
static int drbd_proc_open(struct inode *inode, struct file *file)
{
int err;
if (try_module_get(THIS_MODULE)) {
err = single_open(file, drbd_seq_show, NULL);
if (err)
module_put(THIS_MODULE);
return err;
}
return -ENODEV;
}
static int drbd_proc_release(struct inode *inode, struct file *file)
{
module_put(THIS_MODULE);
return single_release(inode, file);
}
/* PROC FS stuff end */


drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,307 @@
#ifndef __DRBD_PROTOCOL_H
#define __DRBD_PROTOCOL_H
enum drbd_packet {
/* receiver (data socket) */
P_DATA = 0x00,
P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
P_BARRIER = 0x03,
P_BITMAP = 0x04,
P_BECOME_SYNC_TARGET = 0x05,
P_BECOME_SYNC_SOURCE = 0x06,
P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
P_SYNC_PARAM = 0x0a,
P_PROTOCOL = 0x0b,
P_UUIDS = 0x0c,
P_SIZES = 0x0d,
P_STATE = 0x0e,
P_SYNC_UUID = 0x0f,
P_AUTH_CHALLENGE = 0x10,
P_AUTH_RESPONSE = 0x11,
P_STATE_CHG_REQ = 0x12,
/* asender (meta socket) */
P_PING = 0x13,
P_PING_ACK = 0x14,
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
P_BARRIER_ACK = 0x1c,
P_STATE_CHG_REPLY = 0x1d,
/* "new" commands, no longer fitting into the ordering scheme above */
P_OV_REQUEST = 0x1e, /* data socket */
P_OV_REPLY = 0x1f,
P_OV_RESULT = 0x20, /* meta socket */
P_CSUM_RS_REQUEST = 0x21, /* data socket */
P_RS_IS_IN_SYNC = 0x22, /* meta socket */
P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
/* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
/* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
/* 0x2e to 0x30 reserved, used in drbd 9 */
/* REQ_DISCARD. We used "discard" in different contexts before,
* which is why I chose TRIM here, to disambiguate. */
P_TRIM = 0x31,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
/* special command ids for handshake */
P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
};
#ifndef __packed
#define __packed __attribute__((packed))
#endif
/* This is the layout for a packet on the wire.
* The byteorder is the network byte order.
* (except block_id and barrier fields;
* these are pointers to local structs
* and have no relevance for the partner,
* which just echoes them as received.)
*
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
} __packed;
/* Header for big packets, Used for data packets exceeding 64kB */
struct p_header95 {
u16 magic; /* use DRBD_MAGIC_BIG here */
u16 command;
u32 length;
} __packed;
struct p_header100 {
u32 magic;
u16 volume;
u16 command;
u32 length;
u32 pad;
} __packed;
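/* A hedged sender-side sketch (example_prepare_header80() is made up;
 * DRBD_MAGIC comes from <linux/drbd.h>): every header field is
 * converted to network byte order before it goes on the wire. */
static inline void example_prepare_header80(struct p_header80 *h,
					    enum drbd_packet cmd, u16 length)
{
	h->magic   = cpu_to_be32(DRBD_MAGIC);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be16(length);	/* payload bytes after the header */
}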
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* deprecated */
#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
#define DP_UNPLUG 8 /* not used anymore */
#define DP_FUA 16 /* equals REQ_FUA */
#define DP_FLUSH 32 /* equals REQ_FLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
struct p_data {
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
u32 dp_flags;
} __packed;
struct p_trim {
struct p_data p_data;
u32 size; /* == bio->bi_size */
} __packed;
/*
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
* P_SUPERSEDED (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
u64 sector;
u64 block_id;
u32 blksize;
u32 seq_num;
} __packed;
struct p_block_req {
u64 sector;
u64 block_id;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
} __packed;
/*
* commands with their own struct for additional fields:
* P_CONNECTION_FEATURES
* P_BARRIER
* P_BARRIER_ACK
* P_SYNC_PARAM
* ReportParams
*/
#define FF_TRIM 1
struct p_connection_features {
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* should be more than enough for future enhancements.
 * For now, feature_flags and the reserved array shall be zero.
*/
u32 _pad;
u64 reserved[7];
} __packed;
struct p_barrier {
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
u32 resync_rate;
/* Since protocol version 88 and higher. */
char verify_alg[0];
} __packed;
struct p_rs_param_89 {
u32 resync_rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
struct p_rs_param_95 {
u32 resync_rate;
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
u32 c_plan_ahead;
u32 c_delay_target;
u32 c_fill_target;
u32 c_max_rate;
} __packed;
enum drbd_conn_flags {
CF_DISCARD_MY_DATA = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
u32 after_sb_2p;
u32 conn_flags;
u32 two_primaries;
/* Since protocol version 87 and higher. */
char integrity_alg[0];
} __packed;
struct p_uuids {
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
u64 uuid;
} __packed;
struct p_sizes {
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
u32 max_bio_size; /* Maximal size of a BIO */
u16 queue_order_type; /* not yet implemented in DRBD */
u16 dds_flags; /* use enum dds_flags here. */
} __packed;
struct p_state {
u32 state;
} __packed;
struct p_req_state {
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
u32 retcode;
} __packed;
struct p_drbd06_param {
u64 size;
u32 state;
u32 blksize;
u32 protocol;
u32 version;
u32 gen_cnt[5];
u32 bit_map_gen[5];
} __packed;
struct p_block_desc {
u64 sector;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
} __packed;
/* Valid values for the encoding field.
* Bump proto version when changing this. */
enum drbd_bitmap_code {
/* RLE_VLI_Bytes = 0,
* and other bit variants had been defined during
* algorithm evaluation. */
RLE_VLI_Bits = 2,
};
struct p_compressed_bm {
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
* used to pad up to head.length bytes
*/
u8 encoding;
u8 code[0];
} __packed;
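
/* Decoding sketch for the encoding byte above (illustrative, hypothetical
 * local names):
 *
 *	code      = p->encoding & 0x0f;          -- enum drbd_bitmap_code
 *	first_set = (p->encoding & 0x80) != 0;   -- polarity of first runlength
 *	pad_bits  = (p->encoding >> 4) & 0x07;   -- trailing zero padding bits
 */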
struct p_delay_probe93 {
u32 seq_num; /* sequence number to match the two probe packets */
u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
/*
* Bitmap packets need to fit within a single page on the sender and receiver,
* so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
*/
#define DRBD_SOCKET_BUFFER_SIZE 4096
#endif /* __DRBD_PROTOCOL_H */

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,351 @@
/*
drbd_req.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
DRBD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
DRBD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_REQ_H
#define _DRBD_REQ_H
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
/* The request callbacks will be called in irq context by the IDE drivers,
and in Softirqs/Tasklets/BH context by the SCSI drivers,
and by the receiver and worker in kernel-thread context.
Try to get the locking right :) */
/*
* Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
* associated with IO requests originating from the block layer above us.
*
* There are quite a few things that may happen to a drbd request
* during its lifetime.
*
* It will be created.
* It will be marked with the intention to be
* submitted to local disk and/or
 *  sent via the network.
*
* It has to be placed on the transfer log and other housekeeping lists,
 * in case we have a network connection.
*
* It may be identified as a concurrent (write) request
* and be handled accordingly.
*
 * It may be handed over to the local disk subsystem.
* It may be completed by the local disk subsystem,
* either successfully or with io-error.
* In case it is a READ request, and it failed locally,
* it may be retried remotely.
*
* It may be queued for sending.
* It may be handed over to the network stack,
* which may fail.
* It may be acknowledged by the "peer" according to the wire_protocol in use.
 *	This may be a negative ack.
* It may receive a faked ack when the network connection is lost and the
* transfer log is cleaned up.
* Sending may be canceled due to network connection loss.
* When it finally has outlived its time,
* corresponding dirty bits in the resync-bitmap may be cleared or set,
* it will be destroyed,
* and completion will be signalled to the originator,
* with or without "success".
*/
enum drbd_req_event {
CREATED,
TO_BE_SENT,
TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
* these are not "events" but "actions"
* oh, well... */
QUEUE_FOR_NET_WRITE,
QUEUE_FOR_NET_READ,
QUEUE_FOR_SEND_OOS,
/* An empty flush is queued as P_BARRIER,
* which will cause it to complete "successfully",
* even if the local disk flush failed.
*
* Just like "real" requests, empty flushes (blkdev_issue_flush()) will
* only see an error if neither local nor remote data is reachable. */
QUEUE_AS_DRBD_BARRIER,
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
OOS_HANDED_TO_NETWORK,
CONNECTION_LOST_WHILE_PENDING,
READ_RETRY_REMOTE_CANCELED,
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
CONFLICT_RESOLVED,
POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
COMPLETED_OK,
READ_COMPLETED_WITH_ERROR,
READ_AHEAD_COMPLETED_WITH_ERROR,
WRITE_COMPLETED_WITH_ERROR,
DISCARD_COMPLETED_NOTSUPP,
DISCARD_COMPLETED_WITH_ERROR,
ABORT_DISK_IO,
RESEND,
FAIL_FROZEN_DISK_IO,
RESTART_FROZEN_DISK_IO,
NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
* we don't need to do atomic bit operations either, since most of the time we
* need to look at the connection state and/or manipulate some lists at the
* same time, so we should hold the request lock anyways.
*/
enum drbd_req_state_bits {
/* 3210
* 0000: no local possible
* 0001: to be submitted
* UNUSED, we could map: 011: submitted, completion still pending
* 0110: completed ok
* 0010: completed with error
* 1001: Aborted (before completion)
* 1x10: Aborted and completed -> free
*/
__RQ_LOCAL_PENDING,
__RQ_LOCAL_COMPLETED,
__RQ_LOCAL_OK,
__RQ_LOCAL_ABORTED,
/* 87654
* 00000: no network possible
	 * 00001: to be sent
	 * 00011: to be sent, on worker queue
* 00101: sent, expecting recv_ack (B) or write_ack (C)
* 11101: sent,
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write acked (C),
* data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
* or neg-d-acked (read, any protocol)
* or killed from the transfer log
* during cleanup after connection loss
* request can be freed
* 01000: canceled or send failed...
* request can be freed
*/
/* if "SENT" is not set, yet, this can still fail or be canceled.
* if "SENT" is set already, we still wait for an Ack packet.
* when cleared, the master_bio may be completed.
	 * in (B,A) the request object may still linger on the transfer log
* until the corresponding barrier ack comes in */
__RQ_NET_PENDING,
/* If it is QUEUED, and it is a WRITE, it is also registered in the
* transfer log. Currently we need this flag to avoid conflicts between
* worker canceling the request and tl_clear_barrier killing it from
* transfer log. We should restructure the code so this conflict does
* no longer occur. */
__RQ_NET_QUEUED,
/* well, actually only "handed over to the network stack".
*
* TODO can potentially be dropped because of the similar meaning
* of RQ_NET_SENT and ~RQ_NET_QUEUED.
* however it is not exactly the same. before we drop it
* we must ensure that we can tell a request with network part
* from a request without, regardless of what happens to it. */
__RQ_NET_SENT,
/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
* basically this means the corresponding P_BARRIER_ACK was received */
__RQ_NET_DONE,
/* whether or not we know (C) or pretend (B,A) that the write
* was successfully written on the peer.
*/
__RQ_NET_OK,
/* peer called drbd_set_in_sync() for this write */
__RQ_NET_SIS,
	/* keep this last, it's for the RQ_NET_MASK */
__RQ_NET_MAX,
/* Set when this is a write, clear for a read */
__RQ_WRITE,
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
/* The peer has sent a retry ACK */
__RQ_POSTPONED,
/* would have been completed,
* but was not, because of drbd_suspended() */
__RQ_COMPLETION_SUSP,
/* We expect a receive ACK (wire proto B) */
__RQ_EXP_RECEIVE_ACK,
	/* We expect a write ACK (wire proto C) */
__RQ_EXP_WRITE_ACK,
/* waiting for a barrier ack, did an extra kref_get */
__RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
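
/* Illustration (hypothetical initial state, not taken from the driver): a
 * protocol C write that went through the activity log would start out as
 *
 *	req->rq_state = RQ_WRITE | RQ_IN_ACT_LOG | RQ_EXP_WRITE_ACK;
 *
 * while a purely local read starts with no RQ_WRITE and no RQ_NET_* bits.
 */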
/* For waking up the frozen transfer log, mod_req() has to return whether the
   request should be counted in the epoch object */
#define MR_WRITE 1
#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
struct bio *bio;
bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
req->private_bio = bio;
bio->bi_private = req;
bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_iter.bi_size, or similar. But that would be too ugly. */
struct bio_and_error {
struct bio *bio;
int error;
};
extern void start_new_tl_epoch(struct drbd_connection *connection);
extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_device *device,
struct bio_and_error *m);
extern void request_timer_fn(unsigned long data);
extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
extern void tl_abort_disk_io(struct drbd_device *device);
/* this is in drbd_main.c */
extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_device *device = req->device;
struct bio_and_error m;
int rv;
/* __req_mod possibly frees req, do not touch req after that! */
rv = __req_mod(req, what, &m);
if (m.bio)
complete_master_bio(device, &m);
return rv;
}
/* completion of master bio is outside of our spinlock.
* We still may or may not be inside some irqs disabled section
* of the lower level driver completion callback, so we need to
* spin_lock_irqsave here. */
static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
unsigned long flags;
struct drbd_device *device = req->device;
struct bio_and_error m;
int rv;
spin_lock_irqsave(&device->resource->req_lock, flags);
rv = __req_mod(req, what, &m);
spin_unlock_irqrestore(&device->resource->req_lock, flags);
if (m.bio)
complete_master_bio(device, &m);
return rv;
}
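
/* Usage sketch (hypothetical caller): from a context that does not yet hold
 * req_lock, a state transition is applied as
 *
 *	req_mod(req, CONNECTION_LOST_WHILE_PENDING);
 *
 * Since __req_mod() possibly frees the request, req must not be touched
 * after the call.
 */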
static inline bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
s.conn >= C_WF_BITMAP_T &&
s.conn < C_AHEAD);
/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
since we enter state C_AHEAD only if proto >= 96 */
}
#endif

File diff suppressed because it is too large


@@ -0,0 +1,161 @@
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
struct drbd_device;
struct drbd_connection;
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
 * The NS macros expand to a mask and a value that can be bit-ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
 * means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
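
/* Usage sketch (illustration): request that only the connection state change
 * to C_DISCONNECTING, with all other state aspects masked out:
 *
 *	rv = drbd_request_state(device, NS(conn, C_DISCONNECTING));
 *
 * NS() expands to the (mask, val) argument pair the state functions expect.
 */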
enum chg_state_flags {
CS_HARD = 1 << 0,
CS_VERBOSE = 1 << 1,
CS_WAIT_COMPLETE = 1 << 2,
CS_SERIALIZE = 1 << 3,
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
CS_DC_PEER = 1 << 6,
CS_DC_CONN = 1 << 7,
CS_DC_DISK = 1 << 8,
CS_DC_PDSK = 1 << 9,
CS_DC_SUSP = 1 << 10,
CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
CS_IGN_OUTD_FAIL = 1 << 11,
};
/* drbd_dev_state and drbd_state are different types. This is to stress the
   small difference. There is no suspended flag (.susp), and no "suspended
   while fence handler runs" flag (susp_fen). */
union drbd_dev_state {
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned _unused:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11;
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned _unused:1 ;
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianness is not supported"
#endif
};
unsigned int i;
};
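
/* Access sketch: the bitfield view and the integer .i alias the same 32 bits,
 * so a snapshot can be copied out and inspected as a whole (illustration;
 * drbd_read_state() is the accessor the _NS macros above rely on):
 *
 *	union drbd_dev_state s = drbd_read_state(device);
 *	if (s.conn >= C_CONNECTED && s.pdsk == D_UP_TO_DATE)
 *		both replicas are reachable and current
 */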
extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
enum chg_state_flags f,
union drbd_state mask,
union drbd_state val);
extern void drbd_force_state(struct drbd_device *, union drbd_state,
union drbd_state);
extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
union drbd_state,
union drbd_state,
enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
enum chg_state_flags,
struct completion *done);
extern void print_st_err(struct drbd_device *, union drbd_state,
union drbd_state, int);
enum drbd_state_rv
_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
enum drbd_state_rv
conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
extern void drbd_resume_al(struct drbd_device *device);
extern bool conn_all_vols_unconf(struct drbd_connection *connection);
/**
 * drbd_request_state() - Request a state change
* @device: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
 * This is the most graceful way of requesting a state change. It is
 * quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_device *device,
union drbd_state mask,
union drbd_state val)
{
return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
}
enum drbd_role conn_highest_role(struct drbd_connection *connection);
enum drbd_role conn_highest_peer(struct drbd_connection *connection);
enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
#endif


@@ -0,0 +1,118 @@
/*
drbd.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/drbd.h>
#include "drbd_strings.h"
static const char *drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
[C_TIMEOUT] = "Timeout",
[C_BROKEN_PIPE] = "BrokenPipe",
[C_NETWORK_FAILURE] = "NetworkFailure",
[C_PROTOCOL_ERROR] = "ProtocolError",
[C_WF_CONNECTION] = "WFConnection",
[C_WF_REPORT_PARAMS] = "WFReportParams",
[C_TEAR_DOWN] = "TearDown",
[C_CONNECTED] = "Connected",
[C_STARTING_SYNC_S] = "StartingSyncS",
[C_STARTING_SYNC_T] = "StartingSyncT",
[C_WF_BITMAP_S] = "WFBitMapS",
[C_WF_BITMAP_T] = "WFBitMapT",
[C_WF_SYNC_UUID] = "WFSyncUUID",
[C_SYNC_SOURCE] = "SyncSource",
[C_SYNC_TARGET] = "SyncTarget",
[C_PAUSED_SYNC_S] = "PausedSyncS",
[C_PAUSED_SYNC_T] = "PausedSyncT",
[C_VERIFY_S] = "VerifyS",
[C_VERIFY_T] = "VerifyT",
[C_AHEAD] = "Ahead",
[C_BEHIND] = "Behind",
};
static const char *drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
static const char *drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
[D_NEGOTIATING] = "Negotiating",
[D_INCONSISTENT] = "Inconsistent",
[D_OUTDATED] = "Outdated",
[D_UNKNOWN] = "DUnknown",
[D_CONSISTENT] = "Consistent",
[D_UP_TO_DATE] = "UpToDate",
};
static const char *drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
[-SS_DEVICE_IN_USE] = "Device is held open by someone",
[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
[-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)
{
/* enums are unsigned... */
return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
}
const char *drbd_role_str(enum drbd_role s)
{
return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
}
const char *drbd_disk_str(enum drbd_disk_state s)
{
return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
}
const char *drbd_set_st_err_str(enum drbd_state_rv err)
{
return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
err > SS_TWO_PRIMARIES ? "TOO_LARGE"
: drbd_state_sw_errors[-err];
}


@@ -0,0 +1,9 @@
#ifndef __DRBD_STRINGS_H
#define __DRBD_STRINGS_H
extern const char *drbd_conn_str(enum drbd_conns);
extern const char *drbd_role_str(enum drbd_role);
extern const char *drbd_disk_str(enum drbd_disk_state);
extern const char *drbd_set_st_err_str(enum drbd_state_rv);
#endif /* __DRBD_STRINGS_H */


@@ -0,0 +1,351 @@
/*
-*- linux-c -*-
   drbd_vli.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H
/*
* At a granularity of 4KiB storage represented per bit,
 * and storage sizes of several TiB,
* and possibly small-bandwidth replication,
* the bitmap transfer time can take much too long,
* if transmitted in plain text.
*
* We try to reduce the transferred bitmap information
* by encoding runlengths of bit polarity.
*
* We never actually need to encode a "zero" (runlengths are positive).
* But then we have to store the value of the first bit.
* The first bit of information thus shall encode if the first runlength
* gives the number of set or unset bits.
*
* We assume that large areas are either completely set or unset,
* which gives good compression with any runlength method,
* even when encoding the runlength as fixed size 32bit/64bit integers.
*
* Still, there may be areas where the polarity flips every few bits,
 * and encoding the runlength sequence of those areas with fixed size
* integers would be much worse than plaintext.
*
* We want to encode small runlength values with minimum code length,
 * while still being able to encode a huge run of all zeros.
*
* Thus we need a Variable Length Integer encoding, VLI.
*
* For some cases, we produce more code bits than plaintext input.
* We need to send incompressible chunks as plaintext, skip over them
* and then see if the next chunk compresses better.
*
* We don't care too much about "excellent" compression ratio for large
* runlengths (all set/all clear): whether we achieve a factor of 100
* or 1000 is not that much of an issue.
* We do not want to waste too much on short runlengths in the "noisy"
* parts of the bitmap, though.
*
* There are endless variants of VLI, we experimented with:
* * simple byte-based
* * various bit based with different code word length.
*
 * To avoid yet another configuration parameter (choice of bitmap compression
* algorithm) which was difficult to explain and tune, we just chose the one
* variant that turned out best in all test cases.
* Based on real world usage patterns, with device sizes ranging from a few GiB
 * to several TiB, file server/mailserver/webserver/mysql/postgres,
* mostly idle to really busy, the all time winner (though sometimes only
* marginally better) is:
*/
/*
* encoding is "visualised" as
* __little endian__ bitstream, least significant bit first (left most)
*
* this particular encoding is chosen so that the prefix code
* starts as unary encoding the level, then modified so that
* 10 levels can be described in 8bit, with minimal overhead
* for the smaller levels.
*
 * Number of data bits follows the Fibonacci sequence, with the exception of
 * the last level (+1 data bit, so it makes 64 bit total). The only worse code
 * when encoding bit polarity runlength is 1 plain bit => 2 code bits.
 prefix    data bits                                           max val  #data bits
 0         x                                                       0x2           1
 10        x                                                       0x4           1
 110       xx                                                      0x8           2
 1110      xxx                                                    0x10           3
 11110     xxx xx                                                 0x30           5
 111110    xx xxxxxx                                             0x130           8
 11111100  xxxxxxxx xxxxx                                       0x2130          13
 11111110  xxxxxxxx xxxxxxxx xxxxx                            0x202130          21
 11111101  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx          0x400202130          34
 11111111  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx      56
* maximum encodable value: 0x100000400202130 == 2**56 + some */
/* compression "table":
transmitted x 0.29
as plaintext x ........................
x ........................
x ........................
x 0.59 0.21........................
x ........................................................
x .. c ...................................................
x 0.44.. o ...................................................
x .......... d ...................................................
x .......... e ...................................................
X............. ...................................................
x.............. b ...................................................
2.0x............... i ...................................................
#X................ t ...................................................
#................. s ........................... plain bits ..........
-+-----------------------------------------------------------------------
1 16 32 64
*/
/* LEVEL: (total bits, prefix bits, prefix value),
* sorted ascending by number of total bits.
* The rest of the code table is calculated at compiletime from this. */
/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
LEVEL( 2, 1, 0x00); \
LEVEL( 3, 2, 0x01); \
LEVEL( 5, 3, 0x03); \
LEVEL( 7, 4, 0x07); \
LEVEL(10, 5, 0x0f); \
LEVEL(14, 6, 0x1f); \
LEVEL(21, 8, 0x3f); \
LEVEL(29, 8, 0x7f); \
LEVEL(42, 8, 0xbf); \
LEVEL(64, 8, 0xff); \
} while (0)
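
/* Worked examples for the code table above (values as produced by
 * __vli_encode_bits() below):
 *	in = 1  ->  level (2,1,0x00): code 0x0, 2 code bits
 *	in = 2  ->  level (2,1,0x00): code 0x2, 2 code bits
 *	in = 3  ->  level (3,2,0x01): code 0x1, 3 code bits
 *	in = 6  ->  level (5,3,0x03): code 0xb, 5 code bits
 */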
/* finds a suitable level to decode the least significant part of in.
* returns number of bits consumed.
*
* BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
u64 adj = 1;
#define LEVEL(t,b,v) \
do { \
if ((in & ((1 << b) -1)) == v) { \
*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
return t; \
} \
adj += 1ULL << (t - b); \
} while (0)
VLI_L_1_1();
/* NOT REACHED, if VLI_LEVELS code table is defined properly */
BUG();
#undef LEVEL
}
/* return number of code bits needed,
* or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
u64 max = 0;
u64 adj = 1;
if (in == 0)
return -EINVAL;
#define LEVEL(t,b,v) do { \
max += 1ULL << (t - b); \
if (in <= max) { \
if (out) \
*out = ((in - adj) << b) | v; \
return t; \
} \
adj = max + 1; \
} while (0)
VLI_L_1_1();
return -EOVERFLOW;
#undef LEVEL
}
#undef VLI_L_1_1
/* code from here down is independent of the actually used bit code */
/*
* Code length is determined by some unique (e.g. unary) prefix.
* This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
* not a byte stream.
*/
/* for the bitstream, we need a cursor */
struct bitstream_cursor {
/* the current byte */
u8 *b;
	/* the current bit within *b, normalized: 0..7 */
unsigned int bit;
};
/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
cur->b = s;
cur->bit = 0;
}
/* advance cursor by that many bits; maximum expected input value: 64,
* but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
bits += cur->bit;
cur->b = cur->b + (bits >> 3);
cur->bit = bits & 7;
}
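
/* Worked example: starting at byte b with cur->bit == 6, advancing by 13 bits
 * gives 6 + 13 = 19; the cursor moves 19 >> 3 = 2 bytes forward and ends up
 * at bit 19 & 7 = 3. */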
/* the bitstream itself knows its length */
struct bitstream {
struct bitstream_cursor cur;
unsigned char *buf;
size_t buf_len; /* in bytes */
/* for input stream:
* number of trailing 0 bits for padding
* total number of valid bits in stream: buf_len * 8 - pad_bits */
unsigned int pad_bits;
};
static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{
bs->buf = s;
bs->buf_len = len;
bs->pad_bits = pad_bits;
bitstream_cursor_reset(&bs->cur, bs->buf);
}
static inline void bitstream_rewind(struct bitstream *bs)
{
bitstream_cursor_reset(&bs->cur, bs->buf);
memset(bs->buf, 0, bs->buf_len);
}
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
* Ignores "pad_bits".
* Returns zero if bits == 0 (nothing to do).
* Returns number of bits used if successful.
*
* If there is not enough room left in bitstream,
* leaves bitstream unchanged and returns -ENOBUFS.
*/
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
unsigned char *b = bs->cur.b;
unsigned int tmp;
if (bits == 0)
return 0;
if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
return -ENOBUFS;
/* paranoia: strip off hi bits; they should not be set anyways. */
if (bits < 64)
val &= ~0ULL >> (64 - bits);
*b++ |= (val & 0xff) << bs->cur.bit;
for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
*b++ |= (val >> tmp) & 0xff;
bitstream_cursor_advance(&bs->cur, bits);
return bits;
}
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
*
 * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
*
 * If fewer than the requested number of valid bits are left in the
 * bitstream, it still fetches all available bits.
*
* Returns number of actually fetched bits.
*/
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
u64 val;
unsigned int n;
if (bits > 64)
return -EINVAL;
if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
- bs->cur.bit - bs->pad_bits;
if (bits == 0) {
*out = 0;
return 0;
}
/* get the high bits */
val = 0;
n = (bs->cur.bit + bits + 7) >> 3;
/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 bytes */
if (n) {
memcpy(&val, bs->cur.b+1, n - 1);
val = le64_to_cpu(val) << (8 - bs->cur.bit);
}
/* we still need the low bits */
val |= bs->cur.b[0] >> bs->cur.bit;
/* and mask out bits we don't want */
val &= ~0ULL >> (64 - bits);
bitstream_cursor_advance(&bs->cur, bits);
*out = val;
return bits;
}
/* encodes @in as vli into @bs;
* return values
* > 0: number of bits successfully stored in bitstream
* -ENOBUFS @bs is full
* -EINVAL input zero (invalid)
* -EOVERFLOW input too large for this vli code (invalid)
*/
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
	u64 code = code; /* self-init silences a bogus "maybe uninitialized" warning */
int bits = __vli_encode_bits(&code, in);
if (bits <= 0)
return bits;
return bitstream_put_bits(bs, code, bits);
}
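
/* Usage sketch (illustration, caller-supplied buffer): encode a series of
 * runlengths into a zeroed buffer until it is full:
 *
 *	unsigned char buf[DRBD_SOCKET_BUFFER_SIZE];
 *	struct bitstream bs;
 *
 *	bitstream_init(&bs, buf, sizeof(buf), 0);
 *	bitstream_rewind(&bs);            -- zeroes buf, resets the cursor
 *	while (next_runlength(&rl))       -- hypothetical runlength source
 *		if (vli_encode_bits(&bs, rl) <= 0)
 *			break;            -- -ENOBUFS: chunk full, or bad input
 */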
#endif

File diff suppressed because it is too large